xref: /libCEED/rust/libceed-sys/c-src/backends/hip-ref/ceed-hip-ref-restriction.c (revision 437930d19388999b5cc2d76e2fe0d14f58fb41f3)
10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
40d0321e0SJeremy L Thompson //
50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
90d0321e0SJeremy L Thompson //
100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
160d0321e0SJeremy L Thompson 
170d0321e0SJeremy L Thompson #include <ceed/ceed.h>
180d0321e0SJeremy L Thompson #include <ceed/backend.h>
19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h>
200d0321e0SJeremy L Thompson #include <hip/hip_runtime.h>
210d0321e0SJeremy L Thompson #include <stdbool.h>
220d0321e0SJeremy L Thompson #include <stddef.h>
230d0321e0SJeremy L Thompson #include "ceed-hip-ref.h"
240d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h"
250d0321e0SJeremy L Thompson 
260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
270d0321e0SJeremy L Thompson // Apply restriction
280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
290d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r,
30*437930d1SJeremy L Thompson                                         CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
310d0321e0SJeremy L Thompson   int ierr;
320d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
330d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
340d0321e0SJeremy L Thompson   Ceed ceed;
350d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
360d0321e0SJeremy L Thompson   Ceed_Hip *data;
370d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
38*437930d1SJeremy L Thompson   const CeedInt block_size = 64;
39*437930d1SJeremy L Thompson   const CeedInt num_nodes = impl->num_nodes;
40*437930d1SJeremy L Thompson   CeedInt num_elem, elem_size;
41*437930d1SJeremy L Thompson   CeedElemRestrictionGetNumElements(r, &num_elem);
42*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
430d0321e0SJeremy L Thompson   hipFunction_t kernel;
440d0321e0SJeremy L Thompson 
450d0321e0SJeremy L Thompson   // Get vectors
460d0321e0SJeremy L Thompson   const CeedScalar *d_u;
470d0321e0SJeremy L Thompson   CeedScalar *d_v;
480d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
49*437930d1SJeremy L Thompson   if (t_mode == CEED_TRANSPOSE) {
500d0321e0SJeremy L Thompson     // Sum into for transpose mode, e-vec to l-vec
510d0321e0SJeremy L Thompson     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
520d0321e0SJeremy L Thompson   } else {
530d0321e0SJeremy L Thompson     // Overwrite for notranspose mode, l-vec to e-vec
540d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
550d0321e0SJeremy L Thompson   }
560d0321e0SJeremy L Thompson 
570d0321e0SJeremy L Thompson   // Restrict
58*437930d1SJeremy L Thompson   if (t_mode == CEED_NOTRANSPOSE) {
590d0321e0SJeremy L Thompson     // L-vector -> E-vector
600d0321e0SJeremy L Thompson     if (impl->d_ind) {
610d0321e0SJeremy L Thompson       // -- Offsets provided
62*437930d1SJeremy L Thompson       kernel = impl->OffsetNoTranspose;
63*437930d1SJeremy L Thompson       void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v};
64*437930d1SJeremy L Thompson       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
65*437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
66*437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
670d0321e0SJeremy L Thompson     } else {
680d0321e0SJeremy L Thompson       // -- Strided restriction
69*437930d1SJeremy L Thompson       kernel = impl->StridedNoTranspose;
70*437930d1SJeremy L Thompson       void *args[] = {&num_elem, &d_u, &d_v};
71*437930d1SJeremy L Thompson       CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256;
72*437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
73*437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
740d0321e0SJeremy L Thompson     }
750d0321e0SJeremy L Thompson   } else {
760d0321e0SJeremy L Thompson     // E-vector -> L-vector
770d0321e0SJeremy L Thompson     if (impl->d_ind) {
780d0321e0SJeremy L Thompson       // -- Offsets provided
79*437930d1SJeremy L Thompson       kernel = impl->OffsetTranspose;
80*437930d1SJeremy L Thompson       void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices,
81*437930d1SJeremy L Thompson                       &impl->d_t_offsets, &d_u, &d_v
820d0321e0SJeremy L Thompson                      };
83*437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
84*437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
850d0321e0SJeremy L Thompson     } else {
860d0321e0SJeremy L Thompson       // -- Strided restriction
87*437930d1SJeremy L Thompson       kernel = impl->StridedTranspose;
88*437930d1SJeremy L Thompson       void *args[] = {&num_elem, &d_u, &d_v};
89*437930d1SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size),
90*437930d1SJeremy L Thompson                               block_size, args); CeedChkBackend(ierr);
910d0321e0SJeremy L Thompson     }
920d0321e0SJeremy L Thompson   }
930d0321e0SJeremy L Thompson 
940d0321e0SJeremy L Thompson   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
950d0321e0SJeremy L Thompson     *request = NULL;
960d0321e0SJeremy L Thompson 
970d0321e0SJeremy L Thompson   // Restore arrays
980d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
990d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
1000d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1010d0321e0SJeremy L Thompson }
1020d0321e0SJeremy L Thompson 
1030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1040d0321e0SJeremy L Thompson // Blocked not supported
1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1060d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block,
107*437930d1SJeremy L Thompson                                       CeedTransposeMode t_mode, CeedVector u,
1080d0321e0SJeremy L Thompson                                       CeedVector v, CeedRequest *request) {
1090d0321e0SJeremy L Thompson   // LCOV_EXCL_START
1100d0321e0SJeremy L Thompson   int ierr;
1110d0321e0SJeremy L Thompson   Ceed ceed;
1120d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1130d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
1140d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
1150d0321e0SJeremy L Thompson   // LCOV_EXCL_STOP
1160d0321e0SJeremy L Thompson }
1170d0321e0SJeremy L Thompson 
1180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1190d0321e0SJeremy L Thompson // Get offsets
1200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1210d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr,
1220d0321e0SJeremy L Thompson     CeedMemType mtype, const CeedInt **offsets) {
1230d0321e0SJeremy L Thompson   int ierr;
1240d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1250d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr);
1260d0321e0SJeremy L Thompson 
1270d0321e0SJeremy L Thompson   switch (mtype) {
1280d0321e0SJeremy L Thompson   case CEED_MEM_HOST:
1290d0321e0SJeremy L Thompson     *offsets = impl->h_ind;
1300d0321e0SJeremy L Thompson     break;
1310d0321e0SJeremy L Thompson   case CEED_MEM_DEVICE:
1320d0321e0SJeremy L Thompson     *offsets = impl->d_ind;
1330d0321e0SJeremy L Thompson     break;
1340d0321e0SJeremy L Thompson   }
1350d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1360d0321e0SJeremy L Thompson }
1370d0321e0SJeremy L Thompson 
1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1390d0321e0SJeremy L Thompson // Destroy restriction
1400d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1410d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) {
1420d0321e0SJeremy L Thompson   int ierr;
1430d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1440d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
1450d0321e0SJeremy L Thompson 
1460d0321e0SJeremy L Thompson   Ceed ceed;
1470d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1480d0321e0SJeremy L Thompson   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
1490d0321e0SJeremy L Thompson   ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr);
1500d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr);
151*437930d1SJeremy L Thompson   ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr);
152*437930d1SJeremy L Thompson   ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr);
153*437930d1SJeremy L Thompson   ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr);
1540d0321e0SJeremy L Thompson   ierr = CeedFree(&impl); CeedChkBackend(ierr);
155*437930d1SJeremy L Thompson 
1560d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1570d0321e0SJeremy L Thompson }
1580d0321e0SJeremy L Thompson 
1590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1600d0321e0SJeremy L Thompson // Create transpose offsets and indices
1610d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1620d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r,
1630d0321e0SJeremy L Thompson     const CeedInt *indices) {
1640d0321e0SJeremy L Thompson   int ierr;
1650d0321e0SJeremy L Thompson   Ceed ceed;
1660d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
1670d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
1680d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
169*437930d1SJeremy L Thompson   CeedInt num_elem, elem_size, l_size, num_comp;
170*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
171*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
172*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr);
173*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
1740d0321e0SJeremy L Thompson 
175*437930d1SJeremy L Thompson   // Count num_nodes
176*437930d1SJeremy L Thompson   bool *is_node;
177*437930d1SJeremy L Thompson   ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr);
178*437930d1SJeremy L Thompson   const CeedInt size_indices = num_elem * elem_size;
179*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < size_indices; i++)
180*437930d1SJeremy L Thompson     is_node[indices[i]] = 1;
181*437930d1SJeremy L Thompson   CeedInt num_nodes = 0;
182*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < l_size; i++)
183*437930d1SJeremy L Thompson     num_nodes += is_node[i];
184*437930d1SJeremy L Thompson   impl->num_nodes = num_nodes;
1850d0321e0SJeremy L Thompson 
1860d0321e0SJeremy L Thompson   // L-vector offsets array
187*437930d1SJeremy L Thompson   CeedInt *ind_to_offset, *l_vec_indices;
188*437930d1SJeremy L Thompson   ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr);
189*437930d1SJeremy L Thompson   ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr);
1900d0321e0SJeremy L Thompson   CeedInt j = 0;
191*437930d1SJeremy L Thompson   for (CeedInt i = 0; i < l_size; i++)
192*437930d1SJeremy L Thompson     if (is_node[i]) {
193*437930d1SJeremy L Thompson       l_vec_indices[j] = i;
1940d0321e0SJeremy L Thompson       ind_to_offset[i] = j++;
1950d0321e0SJeremy L Thompson     }
196*437930d1SJeremy L Thompson   ierr = CeedFree(&is_node); CeedChkBackend(ierr);
1970d0321e0SJeremy L Thompson 
1980d0321e0SJeremy L Thompson   // Compute transpose offsets and indices
199*437930d1SJeremy L Thompson   const CeedInt size_offsets = num_nodes + 1;
200*437930d1SJeremy L Thompson   CeedInt *t_offsets;
201*437930d1SJeremy L Thompson   ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr);
202*437930d1SJeremy L Thompson   CeedInt *t_indices;
203*437930d1SJeremy L Thompson   ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr);
2040d0321e0SJeremy L Thompson   // Count node multiplicity
205*437930d1SJeremy L Thompson   for (CeedInt e = 0; e < num_elem; ++e)
206*437930d1SJeremy L Thompson     for (CeedInt i = 0; i < elem_size; ++i)
207*437930d1SJeremy L Thompson       ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1];
2080d0321e0SJeremy L Thompson   // Convert to running sum
209*437930d1SJeremy L Thompson   for (CeedInt i = 1; i < size_offsets; ++i)
210*437930d1SJeremy L Thompson     t_offsets[i] += t_offsets[i-1];
2110d0321e0SJeremy L Thompson   // List all E-vec indices associated with L-vec node
212*437930d1SJeremy L Thompson   for (CeedInt e = 0; e < num_elem; ++e) {
213*437930d1SJeremy L Thompson     for (CeedInt i = 0; i < elem_size; ++i) {
214*437930d1SJeremy L Thompson       const CeedInt lid = elem_size*e + i;
2150d0321e0SJeremy L Thompson       const CeedInt gid = indices[lid];
216*437930d1SJeremy L Thompson       t_indices[t_offsets[ind_to_offset[gid]]++] = lid;
2170d0321e0SJeremy L Thompson     }
2180d0321e0SJeremy L Thompson   }
2190d0321e0SJeremy L Thompson   // Reset running sum
220*437930d1SJeremy L Thompson   for (int i = size_offsets - 1; i > 0; --i)
221*437930d1SJeremy L Thompson     t_offsets[i] = t_offsets[i - 1];
222*437930d1SJeremy L Thompson   t_offsets[0] = 0;
2230d0321e0SJeremy L Thompson 
2240d0321e0SJeremy L Thompson   // Copy data to device
2250d0321e0SJeremy L Thompson   // -- L-vector indices
226*437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt));
2270d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
228*437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices,
229*437930d1SJeremy L Thompson                    num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice);
2300d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
2310d0321e0SJeremy L Thompson   // -- Transpose offsets
232*437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt));
2330d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
234*437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt),
2350d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
2360d0321e0SJeremy L Thompson   // -- Transpose indices
237*437930d1SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt));
2380d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
239*437930d1SJeremy L Thompson   ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt),
2400d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
2410d0321e0SJeremy L Thompson 
2420d0321e0SJeremy L Thompson   // Cleanup
2430d0321e0SJeremy L Thompson   ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr);
244*437930d1SJeremy L Thompson   ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr);
245*437930d1SJeremy L Thompson   ierr = CeedFree(&t_offsets); CeedChkBackend(ierr);
246*437930d1SJeremy L Thompson   ierr = CeedFree(&t_indices); CeedChkBackend(ierr);
247*437930d1SJeremy L Thompson 
2480d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2490d0321e0SJeremy L Thompson }
2500d0321e0SJeremy L Thompson 
2510d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2520d0321e0SJeremy L Thompson // Create restriction
2530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
2540d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode,
2550d0321e0SJeremy L Thompson                                   const CeedInt *indices,
2560d0321e0SJeremy L Thompson                                   CeedElemRestriction r) {
2570d0321e0SJeremy L Thompson   int ierr;
2580d0321e0SJeremy L Thompson   Ceed ceed;
2590d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
2600d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
2610d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
262*437930d1SJeremy L Thompson   CeedInt num_elem, num_comp, elem_size;
263*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr);
264*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr);
265*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr);
266*437930d1SJeremy L Thompson   CeedInt size = num_elem * elem_size;
267*437930d1SJeremy L Thompson   CeedInt strides[3] = {1, size, elem_size};
268*437930d1SJeremy L Thompson   CeedInt comp_stride = 1;
2690d0321e0SJeremy L Thompson 
2700d0321e0SJeremy L Thompson   // Stride data
271*437930d1SJeremy L Thompson   bool is_strided;
272*437930d1SJeremy L Thompson   ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr);
273*437930d1SJeremy L Thompson   if (is_strided) {
274*437930d1SJeremy L Thompson     bool has_backend_strides;
275*437930d1SJeremy L Thompson     ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides);
2760d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
277*437930d1SJeremy L Thompson     if (!has_backend_strides) {
2780d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
2790d0321e0SJeremy L Thompson     }
2800d0321e0SJeremy L Thompson   } else {
281*437930d1SJeremy L Thompson     ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr);
2820d0321e0SJeremy L Thompson   }
2830d0321e0SJeremy L Thompson 
2840d0321e0SJeremy L Thompson   impl->h_ind           = NULL;
2850d0321e0SJeremy L Thompson   impl->h_ind_allocated = NULL;
2860d0321e0SJeremy L Thompson   impl->d_ind           = NULL;
2870d0321e0SJeremy L Thompson   impl->d_ind_allocated = NULL;
288*437930d1SJeremy L Thompson   impl->d_t_indices     = NULL;
289*437930d1SJeremy L Thompson   impl->d_t_offsets     = NULL;
290*437930d1SJeremy L Thompson   impl->num_nodes = size;
2910d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr);
292*437930d1SJeremy L Thompson   CeedInt layout[3] = {1, elem_size*num_elem, elem_size};
2930d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr);
2940d0321e0SJeremy L Thompson 
2950d0321e0SJeremy L Thompson   // Set up device indices/offset arrays
2960d0321e0SJeremy L Thompson   if (mtype == CEED_MEM_HOST) {
2970d0321e0SJeremy L Thompson     switch (cmode) {
2980d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
2990d0321e0SJeremy L Thompson       impl->h_ind_allocated = (CeedInt *)indices;
3000d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
3010d0321e0SJeremy L Thompson       break;
3020d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
3030d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
3040d0321e0SJeremy L Thompson       break;
3050d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
3060d0321e0SJeremy L Thompson       break;
3070d0321e0SJeremy L Thompson     }
3080d0321e0SJeremy L Thompson     if (indices != NULL) {
3090d0321e0SJeremy L Thompson       ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
3100d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
3110d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind; // We own the device memory
3120d0321e0SJeremy L Thompson       ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
3130d0321e0SJeremy L Thompson                        hipMemcpyHostToDevice);
3140d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
3150d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
3160d0321e0SJeremy L Thompson     }
3170d0321e0SJeremy L Thompson   } else if (mtype == CEED_MEM_DEVICE) {
3180d0321e0SJeremy L Thompson     switch (cmode) {
3190d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
3200d0321e0SJeremy L Thompson       if (indices != NULL) {
3210d0321e0SJeremy L Thompson         ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
3220d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
3230d0321e0SJeremy L Thompson         impl->d_ind_allocated = impl->d_ind; // We own the device memory
3240d0321e0SJeremy L Thompson         ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
3250d0321e0SJeremy L Thompson                          hipMemcpyDeviceToDevice);
3260d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
3270d0321e0SJeremy L Thompson       }
3280d0321e0SJeremy L Thompson       break;
3290d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
3300d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
3310d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind;
3320d0321e0SJeremy L Thompson       break;
3330d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
3340d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
3350d0321e0SJeremy L Thompson     }
3360d0321e0SJeremy L Thompson     if (indices != NULL) {
3370d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
3380d0321e0SJeremy L Thompson     }
3390d0321e0SJeremy L Thompson   } else {
3400d0321e0SJeremy L Thompson     // LCOV_EXCL_START
3410d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
3420d0321e0SJeremy L Thompson                      "Only MemType = HOST or DEVICE supported");
3430d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
3440d0321e0SJeremy L Thompson   }
3450d0321e0SJeremy L Thompson 
3460d0321e0SJeremy L Thompson   // Compile HIP kernels
347*437930d1SJeremy L Thompson   CeedInt num_nodes = impl->num_nodes;
348*437930d1SJeremy L Thompson   char *restriction_kernel_path, *restriction_kernel_source;
349*437930d1SJeremy L Thompson   ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h",
350*437930d1SJeremy L Thompson                              &restriction_kernel_path); CeedChkBackend(ierr);
351*437930d1SJeremy L Thompson   ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path,
352*437930d1SJeremy L Thompson                                 &restriction_kernel_source);
353*437930d1SJeremy L Thompson   CeedChkBackend(ierr);
354*437930d1SJeremy L Thompson   ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8,
355*437930d1SJeremy L Thompson                         "RESTRICTION_ELEMSIZE", elem_size,
356*437930d1SJeremy L Thompson                         "RESTRICTION_NELEM", num_elem,
357*437930d1SJeremy L Thompson                         "RESTRICTION_NCOMP", num_comp,
358*437930d1SJeremy L Thompson                         "RESTRICTION_NNODES", num_nodes,
359*437930d1SJeremy L Thompson                         "RESTRICTION_COMPSTRIDE", comp_stride,
3600d0321e0SJeremy L Thompson                         "STRIDE_NODES", strides[0],
3610d0321e0SJeremy L Thompson                         "STRIDE_COMP", strides[1],
3620d0321e0SJeremy L Thompson                         "STRIDE_ELEM", strides[2]); CeedChkBackend(ierr);
363*437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose",
364*437930d1SJeremy L Thompson                           &impl->StridedNoTranspose); CeedChkBackend(ierr);
365*437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose",
366*437930d1SJeremy L Thompson                           &impl->OffsetNoTranspose); CeedChkBackend(ierr);
367*437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose",
368*437930d1SJeremy L Thompson                           &impl->StridedTranspose); CeedChkBackend(ierr);
369*437930d1SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose",
370*437930d1SJeremy L Thompson                           &impl->OffsetTranspose); CeedChkBackend(ierr);
371*437930d1SJeremy L Thompson   ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr);
372*437930d1SJeremy L Thompson   ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr);
3730d0321e0SJeremy L Thompson 
3740d0321e0SJeremy L Thompson   // Register backend functions
3750d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply",
3760d0321e0SJeremy L Thompson                                 CeedElemRestrictionApply_Hip);
3770d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3780d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock",
3790d0321e0SJeremy L Thompson                                 CeedElemRestrictionApplyBlock_Hip);
3800d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3810d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets",
3820d0321e0SJeremy L Thompson                                 CeedElemRestrictionGetOffsets_Hip);
3830d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3840d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy",
3850d0321e0SJeremy L Thompson                                 CeedElemRestrictionDestroy_Hip);
3860d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
3870d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3880d0321e0SJeremy L Thompson }
3890d0321e0SJeremy L Thompson 
3900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3910d0321e0SJeremy L Thompson // Blocked not supported
3920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
3930d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype,
3940d0321e0SJeremy L Thompson     const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) {
3950d0321e0SJeremy L Thompson   int ierr;
3960d0321e0SJeremy L Thompson   Ceed ceed;
3970d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
3980d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
3990d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
4000d0321e0SJeremy L Thompson }
4010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
402