xref: /libCEED/rust/libceed-sys/c-src/backends/hip-ref/ceed-hip-ref-restriction.c (revision d7d111ec11fb843d4be607afc8fa89b13b3a5edf)
10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
40d0321e0SJeremy L Thompson //
50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
90d0321e0SJeremy L Thompson //
100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
160d0321e0SJeremy L Thompson 
170d0321e0SJeremy L Thompson #include <ceed/ceed.h>
180d0321e0SJeremy L Thompson #include <ceed/backend.h>
19437930d1SJeremy L Thompson #include <ceed/jit-tools.h>
200d0321e0SJeremy L Thompson #include <hip/hip_runtime.h>
210d0321e0SJeremy L Thompson #include <stdbool.h>
220d0321e0SJeremy L Thompson #include <stddef.h>
230d0321e0SJeremy L Thompson #include "ceed-hip-ref.h"
240d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h"
250d0321e0SJeremy L Thompson 
260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
270d0321e0SJeremy L Thompson // Apply restriction
280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
290d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r,
3046dc0734SJeremy L Thompson                                         CeedTransposeMode t_mode, CeedVector u,
3146dc0734SJeremy L Thompson                                         CeedVector v, CeedRequest *request) {
320d0321e0SJeremy L Thompson   int ierr;
330d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
340d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
350d0321e0SJeremy L Thompson   Ceed ceed;
360d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
370d0321e0SJeremy L Thompson   Ceed_Hip *data;
380d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
39437930d1SJeremy L Thompson   const CeedInt block_size = 64;
40437930d1SJeremy L Thompson   const CeedInt num_nodes = impl->num_nodes;
41437930d1SJeremy L Thompson   CeedInt num_elem, elem_size;
42437930d1SJeremy L Thompson   CeedElemRestrictionGetNumElements(r, &num_elem);
43437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
440d0321e0SJeremy L Thompson   hipFunction_t kernel;
450d0321e0SJeremy L Thompson 
460d0321e0SJeremy L Thompson   // Get vectors
470d0321e0SJeremy L Thompson   const CeedScalar *d_u;
480d0321e0SJeremy L Thompson   CeedScalar *d_v;
490d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
50437930d1SJeremy L Thompson   if (t_mode == CEED_TRANSPOSE) {
510d0321e0SJeremy L Thompson     // Sum into for transpose mode, e-vec to l-vec
520d0321e0SJeremy L Thompson     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
530d0321e0SJeremy L Thompson   } else {
540d0321e0SJeremy L Thompson     // Overwrite for notranspose mode, l-vec to e-vec
550d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
560d0321e0SJeremy L Thompson   }
570d0321e0SJeremy L Thompson 
580d0321e0SJeremy L Thompson   // Restrict
59437930d1SJeremy L Thompson   if (t_mode == CEED_NOTRANSPOSE) {
600d0321e0SJeremy L Thompson     // L-vector -> E-vector
610d0321e0SJeremy L Thompson     if (impl->d_ind) {
620d0321e0SJeremy L Thompson       // -- Offsets provided
63437930d1SJeremy L Thompson       kernel = impl->OffsetNoTranspose;
64437930d1SJeremy L Thompson       void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
65437930d1SJeremy L Thompson       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
66437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
67437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
680d0321e0SJeremy L Thompson     } else {
690d0321e0SJeremy L Thompson       // -- Strided restriction
70437930d1SJeremy L Thompson       kernel = impl->StridedNoTranspose;
71437930d1SJeremy L Thompson       void *args[] = {&num_elem, &d_u, &d_v};
72437930d1SJeremy L Thompson       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
73437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
74437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
750d0321e0SJeremy L Thompson     }
760d0321e0SJeremy L Thompson   } else {
770d0321e0SJeremy L Thompson     // E-vector -> L-vector
780d0321e0SJeremy L Thompson     if (impl->d_ind) {
790d0321e0SJeremy L Thompson       // -- Offsets provided
80437930d1SJeremy L Thompson       kernel = impl->OffsetTranspose;
81437930d1SJeremy L Thompson       void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices,
82437930d1SJeremy L Thompson                       &impl->d_t_offsets, &d_u, &d_v
830d0321e0SJeremy L Thompson                      };
84437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
85437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
860d0321e0SJeremy L Thompson     } else {
870d0321e0SJeremy L Thompson       // -- Strided restriction
88437930d1SJeremy L Thompson       kernel = impl->StridedTranspose;
89437930d1SJeremy L Thompson       void *args[] = {&num_elem, &d_u, &d_v};
90437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
91437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
920d0321e0SJeremy L Thompson     }
930d0321e0SJeremy L Thompson   }
940d0321e0SJeremy L Thompson 
950d0321e0SJeremy L Thompson   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
960d0321e0SJeremy L Thompson     *request = NULL;
970d0321e0SJeremy L Thompson 
980d0321e0SJeremy L Thompson   // Restore arrays
990d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
1000d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
1010d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1020d0321e0SJeremy L Thompson }
1030d0321e0SJeremy L Thompson 
1040d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1050d0321e0SJeremy L Thompson // Blocked not supported
1060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1070d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block,
108437930d1SJeremy L Thompson                                       CeedTransposeMode t_mode, CeedVector u,
1090d0321e0SJeremy L Thompson                                       CeedVector v, CeedRequest *request) {
1100d0321e0SJeremy L Thompson   // LCOV_EXCL_START
1110d0321e0SJeremy L Thompson   int ierr;
1120d0321e0SJeremy L Thompson   Ceed ceed;
1130d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1140d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
1150d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
1160d0321e0SJeremy L Thompson   // LCOV_EXCL_STOP
1170d0321e0SJeremy L Thompson }
1180d0321e0SJeremy L Thompson 
1190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1200d0321e0SJeremy L Thompson // Get offsets
1210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1220d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr,
1230d0321e0SJeremy L Thompson     CeedMemType mtype, const CeedInt **offsets) {
1240d0321e0SJeremy L Thompson   int ierr;
1250d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1260d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr);
1270d0321e0SJeremy L Thompson 
1280d0321e0SJeremy L Thompson   switch (mtype) {
1290d0321e0SJeremy L Thompson   case CEED_MEM_HOST:
1300d0321e0SJeremy L Thompson     *offsets = impl->h_ind;
1310d0321e0SJeremy L Thompson     break;
1320d0321e0SJeremy L Thompson   case CEED_MEM_DEVICE:
1330d0321e0SJeremy L Thompson     *offsets = impl->d_ind;
1340d0321e0SJeremy L Thompson     break;
1350d0321e0SJeremy L Thompson   }
1360d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1370d0321e0SJeremy L Thompson }
1380d0321e0SJeremy L Thompson 
1390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1400d0321e0SJeremy L Thompson // Destroy restriction
1410d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1420d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) {
1430d0321e0SJeremy L Thompson   int ierr;
1440d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1450d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
1460d0321e0SJeremy L Thompson 
1470d0321e0SJeremy L Thompson   Ceed ceed;
1480d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1490d0321e0SJeremy L Thompson   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
1500d0321e0SJeremy L Thompson   ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr);
1510d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr);
152437930d1SJeremy L Thompson   ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr);
153437930d1SJeremy L Thompson   ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr);
154437930d1SJeremy L Thompson   ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr);
1550d0321e0SJeremy L Thompson   ierr = CeedFree(&impl); CeedChkBackend(ierr);
156437930d1SJeremy L Thompson 
1570d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1580d0321e0SJeremy L Thompson }
1590d0321e0SJeremy L Thompson 
1600d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1610d0321e0SJeremy L Thompson // Create transpose offsets and indices
1620d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1630d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r,
1640d0321e0SJeremy L Thompson     const CeedInt *indices) {
1650d0321e0SJeremy L Thompson   int ierr;
1660d0321e0SJeremy L Thompson   Ceed ceed;
1670d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1680d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1690d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
170437930d1SJeremy L Thompson   CeedInt num_elem, elem_size, l_size, num_comp;
171437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
172437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
173437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
174437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
1750d0321e0SJeremy L Thompson 
176437930d1SJeremy L Thompson   // Count num_nodes
177437930d1SJeremy L Thompson   bool *is_node;
178437930d1SJeremy L Thompson   ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr);
179437930d1SJeremy L Thompson   const CeedInt size_indices = num_elem * elem_size;
180437930d1SJeremy L Thompson   for (CeedInt i = 0; i < size_indices; i++)
181437930d1SJeremy L Thompson     is_node[indices[i]] = 1;
182437930d1SJeremy L Thompson   CeedInt num_nodes = 0;
183437930d1SJeremy L Thompson   for (CeedInt i = 0; i < l_size; i++)
184437930d1SJeremy L Thompson     num_nodes += is_node[i];
185437930d1SJeremy L Thompson   impl->num_nodes = num_nodes;
1860d0321e0SJeremy L Thompson 
1870d0321e0SJeremy L Thompson   // L-vector offsets array
188437930d1SJeremy L Thompson   CeedInt *ind_to_offset, *l_vec_indices;
189437930d1SJeremy L Thompson   ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr);
190437930d1SJeremy L Thompson   ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr);
1910d0321e0SJeremy L Thompson   CeedInt j = 0;
192437930d1SJeremy L Thompson   for (CeedInt i = 0; i < l_size; i++)
193437930d1SJeremy L Thompson     if (is_node[i]) {
194437930d1SJeremy L Thompson       l_vec_indices[j] = i;
1950d0321e0SJeremy L Thompson       ind_to_offset[i] = j++;
1960d0321e0SJeremy L Thompson     }
197437930d1SJeremy L Thompson   ierr = CeedFree(&is_node); CeedChkBackend(ierr);
1980d0321e0SJeremy L Thompson 
1990d0321e0SJeremy L Thompson   // Compute transpose offsets and indices
200437930d1SJeremy L Thompson   const CeedInt size_offsets = num_nodes + 1;
201437930d1SJeremy L Thompson   CeedInt *t_offsets;
202437930d1SJeremy L Thompson   ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr);
203437930d1SJeremy L Thompson   CeedInt *t_indices;
204437930d1SJeremy L Thompson   ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr);
2050d0321e0SJeremy L Thompson   // Count node multiplicity
206437930d1SJeremy L Thompson   for (CeedInt e = 0; e < num_elem; ++e)
207437930d1SJeremy L Thompson     for (CeedInt i = 0; i < elem_size; ++i)
208437930d1SJeremy L Thompson       ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1];
2090d0321e0SJeremy L Thompson   // Convert to running sum
210437930d1SJeremy L Thompson   for (CeedInt i = 1; i < size_offsets; ++i)
211437930d1SJeremy L Thompson     t_offsets[i] += t_offsets[i-1];
2120d0321e0SJeremy L Thompson   // List all E-vec indices associated with L-vec node
213437930d1SJeremy L Thompson   for (CeedInt e = 0; e < num_elem; ++e) {
214437930d1SJeremy L Thompson     for (CeedInt i = 0; i < elem_size; ++i) {
215437930d1SJeremy L Thompson       const CeedInt lid = elem_size*e + i;
2160d0321e0SJeremy L Thompson       const CeedInt gid = indices[lid];
217437930d1SJeremy L Thompson       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
2180d0321e0SJeremy L Thompson     }
2190d0321e0SJeremy L Thompson   }
2200d0321e0SJeremy L Thompson   // Reset running sum
221437930d1SJeremy L Thompson   for (int i = size_offsets - 1; i > 0; --i)
222437930d1SJeremy L Thompson     t_offsets[i] = t_offsets[i - 1];
223437930d1SJeremy L Thompson   t_offsets[0] = 0;
2240d0321e0SJeremy L Thompson 
2250d0321e0SJeremy L Thompson   // Copy data to device
2260d0321e0SJeremy L Thompson   // -- L-vector indices
227437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt));
2280d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
229437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices,
230437930d1SJeremy L Thompson                    num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice);
2310d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
2320d0321e0SJeremy L Thompson   // -- Transpose offsets
233437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt));
2340d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
235437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt),
2360d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
2370d0321e0SJeremy L Thompson   // -- Transpose indices
238437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt));
2390d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
240437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt),
2410d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
2420d0321e0SJeremy L Thompson 
2430d0321e0SJeremy L Thompson   // Cleanup
2440d0321e0SJeremy L Thompson   ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr);
245437930d1SJeremy L Thompson   ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr);
246437930d1SJeremy L Thompson   ierr = CeedFree(&t_offsets); CeedChkBackend(ierr);
247437930d1SJeremy L Thompson   ierr = CeedFree(&t_indices); CeedChkBackend(ierr);
248437930d1SJeremy L Thompson 
2490d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2500d0321e0SJeremy L Thompson }
2510d0321e0SJeremy L Thompson 
2520d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2530d0321e0SJeremy L Thompson // Create restriction
2540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2550d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode,
2560d0321e0SJeremy L Thompson                                   const CeedInt *indices,
2570d0321e0SJeremy L Thompson                                   CeedElemRestriction r) {
2580d0321e0SJeremy L Thompson   int ierr;
2590d0321e0SJeremy L Thompson   Ceed ceed;
2600d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
2610d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
2620d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
263437930d1SJeremy L Thompson   CeedInt num_elem, num_comp, elem_size;
264437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
265437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
266437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
267437930d1SJeremy L Thompson   CeedInt size = num_elem * elem_size;
268437930d1SJeremy L Thompson   CeedInt strides[3] = {1, size, elem_size};
269437930d1SJeremy L Thompson   CeedInt comp_stride = 1;
2700d0321e0SJeremy L Thompson 
2710d0321e0SJeremy L Thompson   // Stride data
272437930d1SJeremy L Thompson   bool is_strided;
273437930d1SJeremy L Thompson   ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr);
274437930d1SJeremy L Thompson   if (is_strided) {
275437930d1SJeremy L Thompson     bool has_backend_strides;
276437930d1SJeremy L Thompson     ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides);
2770d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
278437930d1SJeremy L Thompson     if (!has_backend_strides) {
2790d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
2800d0321e0SJeremy L Thompson     }
2810d0321e0SJeremy L Thompson   } else {
282437930d1SJeremy L Thompson     ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
2830d0321e0SJeremy L Thompson   }
2840d0321e0SJeremy L Thompson 
2850d0321e0SJeremy L Thompson   impl->h_ind           = NULL;
2860d0321e0SJeremy L Thompson   impl->h_ind_allocated = NULL;
2870d0321e0SJeremy L Thompson   impl->d_ind           = NULL;
2880d0321e0SJeremy L Thompson   impl->d_ind_allocated = NULL;
289437930d1SJeremy L Thompson   impl->d_t_indices     = NULL;
290437930d1SJeremy L Thompson   impl->d_t_offsets     = NULL;
291437930d1SJeremy L Thompson   impl->num_nodes = size;
2920d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr);
293437930d1SJeremy L Thompson   CeedInt layout[3] = {1, elem_size*num_elem, elem_size};
2940d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr);
2950d0321e0SJeremy L Thompson 
2960d0321e0SJeremy L Thompson   // Set up device indices/offset arrays
2970d0321e0SJeremy L Thompson   if (mtype == CEED_MEM_HOST) {
2980d0321e0SJeremy L Thompson     switch (cmode) {
2990d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
3000d0321e0SJeremy L Thompson       impl->h_ind_allocated = (CeedInt *)indices;
3010d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
3020d0321e0SJeremy L Thompson       break;
3030d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
3040d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
3050d0321e0SJeremy L Thompson       break;
3060d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
3070d0321e0SJeremy L Thompson       break;
3080d0321e0SJeremy L Thompson     }
3090d0321e0SJeremy L Thompson     if (indices != NULL) {
3100d0321e0SJeremy L Thompson       ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
3110d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
3120d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind; // We own the device memory
3130d0321e0SJeremy L Thompson       ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
3140d0321e0SJeremy L Thompson                        hipMemcpyHostToDevice);
3150d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
3160d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
3170d0321e0SJeremy L Thompson     }
3180d0321e0SJeremy L Thompson   } else if (mtype == CEED_MEM_DEVICE) {
3190d0321e0SJeremy L Thompson     switch (cmode) {
3200d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
3210d0321e0SJeremy L Thompson       if (indices != NULL) {
3220d0321e0SJeremy L Thompson         ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
3230d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
3240d0321e0SJeremy L Thompson         impl->d_ind_allocated = impl->d_ind; // We own the device memory
3250d0321e0SJeremy L Thompson         ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
3260d0321e0SJeremy L Thompson                          hipMemcpyDeviceToDevice);
3270d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
3280d0321e0SJeremy L Thompson       }
3290d0321e0SJeremy L Thompson       break;
3300d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
3310d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
3320d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind;
3330d0321e0SJeremy L Thompson       break;
3340d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
3350d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
3360d0321e0SJeremy L Thompson     }
3370d0321e0SJeremy L Thompson     if (indices != NULL) {
3380d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
3390d0321e0SJeremy L Thompson     }
3400d0321e0SJeremy L Thompson   } else {
3410d0321e0SJeremy L Thompson     // LCOV_EXCL_START
3420d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
3430d0321e0SJeremy L Thompson                      "Only MemType = HOST or DEVICE supported");
3440d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
3450d0321e0SJeremy L Thompson   }
3460d0321e0SJeremy L Thompson 
3470d0321e0SJeremy L Thompson   // Compile HIP kernels
348437930d1SJeremy L Thompson   CeedInt num_nodes = impl->num_nodes;
349437930d1SJeremy L Thompson   char *restriction_kernel_path, *restriction_kernel_source;
350437930d1SJeremy L Thompson   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h",
351437930d1SJeremy L Thompson                              &restriction_kernel_path); CeedChkBackend(ierr);
35246dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n");
353437930d1SJeremy L Thompson   ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path,
354437930d1SJeremy L Thompson                                 &restriction_kernel_source);
355437930d1SJeremy L Thompson   CeedChkBackend(ierr);
35646dc0734SJeremy L Thompson   CeedDebug256(ceed, 2,
35746dc0734SJeremy L Thompson                "----- Loading Restriction Kernel Source Complete! -----\n");
358437930d1SJeremy L Thompson   ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8,
359*d7d111ecSJeremy L Thompson                         "RESTR_ELEM_SIZE", elem_size,
360*d7d111ecSJeremy L Thompson                         "RESTR_NUM_ELEM", num_elem,
361*d7d111ecSJeremy L Thompson                         "RESTR_NUM_COMP", num_comp,
362*d7d111ecSJeremy L Thompson                         "RESTR_NUM_NODES", num_nodes,
363*d7d111ecSJeremy L Thompson                         "RESTR_COMP_STRIDE", comp_stride,
364*d7d111ecSJeremy L Thompson                         "RESTR_STRIDE_NODES", strides[0],
365*d7d111ecSJeremy L Thompson                         "RESTR_STRIDE_COMP", strides[1],
366*d7d111ecSJeremy L Thompson                         "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr);
367437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose",
368437930d1SJeremy L Thompson                           &impl->StridedNoTranspose); CeedChkBackend(ierr);
369437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose",
370437930d1SJeremy L Thompson                           &impl->OffsetNoTranspose); CeedChkBackend(ierr);
371437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose",
372437930d1SJeremy L Thompson                           &impl->StridedTranspose); CeedChkBackend(ierr);
373437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose",
374437930d1SJeremy L Thompson                           &impl->OffsetTranspose); CeedChkBackend(ierr);
375437930d1SJeremy L Thompson   ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr);
376437930d1SJeremy L Thompson   ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr);
3770d0321e0SJeremy L Thompson 
3780d0321e0SJeremy L Thompson   // Register backend functions
3790d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply",
3800d0321e0SJeremy L Thompson                                 CeedElemRestrictionApply_Hip);
3810d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3820d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock",
3830d0321e0SJeremy L Thompson                                 CeedElemRestrictionApplyBlock_Hip);
3840d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3850d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets",
3860d0321e0SJeremy L Thompson                                 CeedElemRestrictionGetOffsets_Hip);
3870d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3880d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy",
3890d0321e0SJeremy L Thompson                                 CeedElemRestrictionDestroy_Hip);
3900d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3910d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3920d0321e0SJeremy L Thompson }
3930d0321e0SJeremy L Thompson 
3940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3950d0321e0SJeremy L Thompson // Blocked not supported
3960d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3970d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype,
3980d0321e0SJeremy L Thompson     const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) {
3990d0321e0SJeremy L Thompson   int ierr;
4000d0321e0SJeremy L Thompson   Ceed ceed;
4010d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
4020d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
4030d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
4040d0321e0SJeremy L Thompson }
4050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
406