xref: /libCEED/backends/hip-ref/ceed-hip-ref-restriction.c (revision 0d0321e0e600f17fbb9528732fcb5c1d5c63fc0f)
1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
4*0d0321e0SJeremy L Thompson //
5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
9*0d0321e0SJeremy L Thompson //
10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
16*0d0321e0SJeremy L Thompson 
17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h>
18*0d0321e0SJeremy L Thompson #include <ceed/backend.h>
19*0d0321e0SJeremy L Thompson #include <hip/hip_runtime.h>
20*0d0321e0SJeremy L Thompson #include <stdbool.h>
21*0d0321e0SJeremy L Thompson #include <stddef.h>
22*0d0321e0SJeremy L Thompson #include "ceed-hip-ref.h"
23*0d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h"
24*0d0321e0SJeremy L Thompson 
25*0d0321e0SJeremy L Thompson // *INDENT-OFF*
26*0d0321e0SJeremy L Thompson static const char *restrictionkernels = QUOTE(
27*0d0321e0SJeremy L Thompson 
28*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
29*0d0321e0SJeremy L Thompson // L-vector -> E-vector, strided
30*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
31*0d0321e0SJeremy L Thompson extern "C" __global__ void noTrStrided(const CeedInt nelem,
32*0d0321e0SJeremy L Thompson                                        const CeedScalar *__restrict__ u,
33*0d0321e0SJeremy L Thompson                                        CeedScalar *__restrict__ v) {
34*0d0321e0SJeremy L Thompson   for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x;
35*0d0321e0SJeremy L Thompson       node < nelem*RESTRICTION_ELEMSIZE;
36*0d0321e0SJeremy L Thompson       node += blockDim.x * gridDim.x) {
37*0d0321e0SJeremy L Thompson     const CeedInt locNode = node % RESTRICTION_ELEMSIZE;
38*0d0321e0SJeremy L Thompson     const CeedInt elem = node / RESTRICTION_ELEMSIZE;
39*0d0321e0SJeremy L Thompson 
40*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
41*0d0321e0SJeremy L Thompson       v[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM +
42*0d0321e0SJeremy L Thompson         elem*RESTRICTION_ELEMSIZE] =
43*0d0321e0SJeremy L Thompson           u[locNode*STRIDE_NODES + comp*STRIDE_COMP + elem*STRIDE_ELEM];
44*0d0321e0SJeremy L Thompson   }
45*0d0321e0SJeremy L Thompson }
46*0d0321e0SJeremy L Thompson 
47*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
48*0d0321e0SJeremy L Thompson // L-vector -> E-vector, offsets provided
49*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
50*0d0321e0SJeremy L Thompson extern "C" __global__ void noTrOffset(const CeedInt nelem,
51*0d0321e0SJeremy L Thompson                                       const CeedInt *__restrict__ indices,
52*0d0321e0SJeremy L Thompson                                       const CeedScalar *__restrict__ u,
53*0d0321e0SJeremy L Thompson                                       CeedScalar *__restrict__ v) {
54*0d0321e0SJeremy L Thompson   for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x;
55*0d0321e0SJeremy L Thompson       node < nelem*RESTRICTION_ELEMSIZE;
56*0d0321e0SJeremy L Thompson       node += blockDim.x * gridDim.x) {
57*0d0321e0SJeremy L Thompson     const CeedInt ind = indices[node];
58*0d0321e0SJeremy L Thompson     const CeedInt locNode = node % RESTRICTION_ELEMSIZE;
59*0d0321e0SJeremy L Thompson     const CeedInt elem = node / RESTRICTION_ELEMSIZE;
60*0d0321e0SJeremy L Thompson 
61*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
62*0d0321e0SJeremy L Thompson       v[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM +
63*0d0321e0SJeremy L Thompson         elem*RESTRICTION_ELEMSIZE] =
64*0d0321e0SJeremy L Thompson           u[ind + comp*RESTRICTION_COMPSTRIDE];
65*0d0321e0SJeremy L Thompson   }
66*0d0321e0SJeremy L Thompson }
67*0d0321e0SJeremy L Thompson 
68*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
69*0d0321e0SJeremy L Thompson // E-vector -> L-vector, strided
70*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
71*0d0321e0SJeremy L Thompson extern "C" __global__ void trStrided(const CeedInt nelem,
72*0d0321e0SJeremy L Thompson     const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
73*0d0321e0SJeremy L Thompson   for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x;
74*0d0321e0SJeremy L Thompson       node < nelem*RESTRICTION_ELEMSIZE;
75*0d0321e0SJeremy L Thompson       node += blockDim.x * gridDim.x) {
76*0d0321e0SJeremy L Thompson     const CeedInt locNode = node % RESTRICTION_ELEMSIZE;
77*0d0321e0SJeremy L Thompson     const CeedInt elem = node / RESTRICTION_ELEMSIZE;
78*0d0321e0SJeremy L Thompson 
79*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
80*0d0321e0SJeremy L Thompson       v[locNode*STRIDE_NODES + comp*STRIDE_COMP + elem*STRIDE_ELEM] +=
81*0d0321e0SJeremy L Thompson           u[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM +
82*0d0321e0SJeremy L Thompson             elem*RESTRICTION_ELEMSIZE];
83*0d0321e0SJeremy L Thompson   }
84*0d0321e0SJeremy L Thompson }
85*0d0321e0SJeremy L Thompson 
86*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
87*0d0321e0SJeremy L Thompson // E-vector -> L-vector, offsets provided
88*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
89*0d0321e0SJeremy L Thompson extern "C" __global__ void trOffset(const CeedInt *__restrict__ lvec_indices,
90*0d0321e0SJeremy L Thompson                                     const CeedInt *__restrict__ tindices,
91*0d0321e0SJeremy L Thompson                                     const CeedInt *__restrict__ toffsets,
92*0d0321e0SJeremy L Thompson                                     const CeedScalar *__restrict__ u,
93*0d0321e0SJeremy L Thompson                                     CeedScalar *__restrict__ v) {
94*0d0321e0SJeremy L Thompson   CeedScalar value[RESTRICTION_NCOMP];
95*0d0321e0SJeremy L Thompson 
96*0d0321e0SJeremy L Thompson   for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x;
97*0d0321e0SJeremy L Thompson        i < RESTRICTION_NNODES;
98*0d0321e0SJeremy L Thompson        i += blockDim.x * gridDim.x) {
99*0d0321e0SJeremy L Thompson     const CeedInt ind = lvec_indices[i];
100*0d0321e0SJeremy L Thompson     const CeedInt rng1 = toffsets[i];
101*0d0321e0SJeremy L Thompson     const CeedInt rngN = toffsets[i+1];
102*0d0321e0SJeremy L Thompson 
103*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
104*0d0321e0SJeremy L Thompson       value[comp] = 0.0;
105*0d0321e0SJeremy L Thompson 
106*0d0321e0SJeremy L Thompson     for (CeedInt j = rng1; j < rngN; ++j) {
107*0d0321e0SJeremy L Thompson       const CeedInt tind = tindices[j];
108*0d0321e0SJeremy L Thompson       CeedInt locNode = tind % RESTRICTION_ELEMSIZE;
109*0d0321e0SJeremy L Thompson       CeedInt elem = tind / RESTRICTION_ELEMSIZE;
110*0d0321e0SJeremy L Thompson 
111*0d0321e0SJeremy L Thompson       for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
112*0d0321e0SJeremy L Thompson         value[comp] += u[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM +
113*0d0321e0SJeremy L Thompson                          elem*RESTRICTION_ELEMSIZE];
114*0d0321e0SJeremy L Thompson     }
115*0d0321e0SJeremy L Thompson 
116*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp)
117*0d0321e0SJeremy L Thompson       v[ind + comp*RESTRICTION_COMPSTRIDE] += value[comp];
118*0d0321e0SJeremy L Thompson   }
119*0d0321e0SJeremy L Thompson }
120*0d0321e0SJeremy L Thompson 
121*0d0321e0SJeremy L Thompson );
122*0d0321e0SJeremy L Thompson // *INDENT-ON*
123*0d0321e0SJeremy L Thompson 
124*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
125*0d0321e0SJeremy L Thompson // Apply restriction
126*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
127*0d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r,
128*0d0321e0SJeremy L Thompson                                         CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) {
129*0d0321e0SJeremy L Thompson   int ierr;
130*0d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
131*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
132*0d0321e0SJeremy L Thompson   Ceed ceed;
133*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
134*0d0321e0SJeremy L Thompson   Ceed_Hip *data;
135*0d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr);
136*0d0321e0SJeremy L Thompson   const CeedInt blocksize = 64;
137*0d0321e0SJeremy L Thompson   const CeedInt nnodes = impl->nnodes;
138*0d0321e0SJeremy L Thompson   CeedInt nelem, elemsize;
139*0d0321e0SJeremy L Thompson   CeedElemRestrictionGetNumElements(r, &nelem);
140*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr);
141*0d0321e0SJeremy L Thompson   hipFunction_t kernel;
142*0d0321e0SJeremy L Thompson 
143*0d0321e0SJeremy L Thompson   // Get vectors
144*0d0321e0SJeremy L Thompson   const CeedScalar *d_u;
145*0d0321e0SJeremy L Thompson   CeedScalar *d_v;
146*0d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
147*0d0321e0SJeremy L Thompson   if (tmode == CEED_TRANSPOSE) {
148*0d0321e0SJeremy L Thompson     // Sum into for transpose mode, e-vec to l-vec
149*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
150*0d0321e0SJeremy L Thompson   } else {
151*0d0321e0SJeremy L Thompson     // Overwrite for notranspose mode, l-vec to e-vec
152*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
153*0d0321e0SJeremy L Thompson   }
154*0d0321e0SJeremy L Thompson 
155*0d0321e0SJeremy L Thompson   // Restrict
156*0d0321e0SJeremy L Thompson   if (tmode == CEED_NOTRANSPOSE) {
157*0d0321e0SJeremy L Thompson     // L-vector -> E-vector
158*0d0321e0SJeremy L Thompson     if (impl->d_ind) {
159*0d0321e0SJeremy L Thompson       // -- Offsets provided
160*0d0321e0SJeremy L Thompson       kernel = impl->noTrOffset;
161*0d0321e0SJeremy L Thompson       void *args[] = {&nelem, &impl->d_ind, &d_u, &d_v};
162*0d0321e0SJeremy L Thompson       CeedInt blocksize = elemsize<256?(elemsize>64?elemsize:64):256;
163*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize),
164*0d0321e0SJeremy L Thompson                               blocksize, args); CeedChkBackend(ierr);
165*0d0321e0SJeremy L Thompson     } else {
166*0d0321e0SJeremy L Thompson       // -- Strided restriction
167*0d0321e0SJeremy L Thompson       kernel = impl->noTrStrided;
168*0d0321e0SJeremy L Thompson       void *args[] = {&nelem, &d_u, &d_v};
169*0d0321e0SJeremy L Thompson       CeedInt blocksize = elemsize<256?(elemsize>64?elemsize:64):256;
170*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize),
171*0d0321e0SJeremy L Thompson                               blocksize, args); CeedChkBackend(ierr);
172*0d0321e0SJeremy L Thompson     }
173*0d0321e0SJeremy L Thompson   } else {
174*0d0321e0SJeremy L Thompson     // E-vector -> L-vector
175*0d0321e0SJeremy L Thompson     if (impl->d_ind) {
176*0d0321e0SJeremy L Thompson       // -- Offsets provided
177*0d0321e0SJeremy L Thompson       kernel = impl->trOffset;
178*0d0321e0SJeremy L Thompson       void *args[] = {&impl->d_lvec_indices, &impl->d_tindices,
179*0d0321e0SJeremy L Thompson                       &impl->d_toffsets, &d_u, &d_v
180*0d0321e0SJeremy L Thompson                      };
181*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize),
182*0d0321e0SJeremy L Thompson                               blocksize, args); CeedChkBackend(ierr);
183*0d0321e0SJeremy L Thompson     } else {
184*0d0321e0SJeremy L Thompson       // -- Strided restriction
185*0d0321e0SJeremy L Thompson       kernel = impl->trStrided;
186*0d0321e0SJeremy L Thompson       void *args[] = {&nelem, &d_u, &d_v};
187*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize),
188*0d0321e0SJeremy L Thompson                               blocksize, args); CeedChkBackend(ierr);
189*0d0321e0SJeremy L Thompson     }
190*0d0321e0SJeremy L Thompson   }
191*0d0321e0SJeremy L Thompson 
192*0d0321e0SJeremy L Thompson   if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
193*0d0321e0SJeremy L Thompson     *request = NULL;
194*0d0321e0SJeremy L Thompson 
195*0d0321e0SJeremy L Thompson   // Restore arrays
196*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
197*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
198*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
199*0d0321e0SJeremy L Thompson }
200*0d0321e0SJeremy L Thompson 
201*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
202*0d0321e0SJeremy L Thompson // Blocked not supported
203*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
204*0d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block,
205*0d0321e0SJeremy L Thompson                                       CeedTransposeMode tmode, CeedVector u,
206*0d0321e0SJeremy L Thompson                                       CeedVector v, CeedRequest *request) {
207*0d0321e0SJeremy L Thompson   // LCOV_EXCL_START
208*0d0321e0SJeremy L Thompson   int ierr;
209*0d0321e0SJeremy L Thompson   Ceed ceed;
210*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
211*0d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
212*0d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
213*0d0321e0SJeremy L Thompson   // LCOV_EXCL_STOP
214*0d0321e0SJeremy L Thompson }
215*0d0321e0SJeremy L Thompson 
216*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
217*0d0321e0SJeremy L Thompson // Get offsets
218*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
219*0d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr,
220*0d0321e0SJeremy L Thompson     CeedMemType mtype, const CeedInt **offsets) {
221*0d0321e0SJeremy L Thompson   int ierr;
222*0d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
223*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr);
224*0d0321e0SJeremy L Thompson 
225*0d0321e0SJeremy L Thompson   switch (mtype) {
226*0d0321e0SJeremy L Thompson   case CEED_MEM_HOST:
227*0d0321e0SJeremy L Thompson     *offsets = impl->h_ind;
228*0d0321e0SJeremy L Thompson     break;
229*0d0321e0SJeremy L Thompson   case CEED_MEM_DEVICE:
230*0d0321e0SJeremy L Thompson     *offsets = impl->d_ind;
231*0d0321e0SJeremy L Thompson     break;
232*0d0321e0SJeremy L Thompson   }
233*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
234*0d0321e0SJeremy L Thompson }
235*0d0321e0SJeremy L Thompson 
236*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
237*0d0321e0SJeremy L Thompson // Destroy restriction
238*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
239*0d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) {
240*0d0321e0SJeremy L Thompson   int ierr;
241*0d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
242*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
243*0d0321e0SJeremy L Thompson 
244*0d0321e0SJeremy L Thompson   Ceed ceed;
245*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
246*0d0321e0SJeremy L Thompson   ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr);
247*0d0321e0SJeremy L Thompson   ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr);
248*0d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr);
249*0d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_toffsets); CeedChk_Hip(ceed, ierr);
250*0d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_tindices); CeedChk_Hip(ceed, ierr);
251*0d0321e0SJeremy L Thompson   ierr = hipFree(impl->d_lvec_indices); CeedChk_Hip(ceed, ierr);
252*0d0321e0SJeremy L Thompson 
253*0d0321e0SJeremy L Thompson   ierr = CeedFree(&impl); CeedChkBackend(ierr);
254*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
255*0d0321e0SJeremy L Thompson }
256*0d0321e0SJeremy L Thompson 
257*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
258*0d0321e0SJeremy L Thompson // Create transpose offsets and indices
259*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
260*0d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r,
261*0d0321e0SJeremy L Thompson     const CeedInt *indices) {
262*0d0321e0SJeremy L Thompson   int ierr;
263*0d0321e0SJeremy L Thompson   Ceed ceed;
264*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
265*0d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
266*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr);
267*0d0321e0SJeremy L Thompson   CeedInt nelem, elemsize, lsize, ncomp;
268*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &nelem); CeedChkBackend(ierr);
269*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr);
270*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetLVectorSize(r, &lsize); CeedChkBackend(ierr);
271*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &ncomp); CeedChkBackend(ierr);
272*0d0321e0SJeremy L Thompson 
273*0d0321e0SJeremy L Thompson   // Count nnodes
274*0d0321e0SJeremy L Thompson   bool *isNode;
275*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(lsize, &isNode); CeedChkBackend(ierr);
276*0d0321e0SJeremy L Thompson   const CeedInt sizeIndices = nelem * elemsize;
277*0d0321e0SJeremy L Thompson   for (CeedInt i = 0; i < sizeIndices; i++)
278*0d0321e0SJeremy L Thompson     isNode[indices[i]] = 1;
279*0d0321e0SJeremy L Thompson   CeedInt nnodes = 0;
280*0d0321e0SJeremy L Thompson   for (CeedInt i = 0; i < lsize; i++)
281*0d0321e0SJeremy L Thompson     nnodes += isNode[i];
282*0d0321e0SJeremy L Thompson   impl->nnodes = nnodes;
283*0d0321e0SJeremy L Thompson 
284*0d0321e0SJeremy L Thompson   // L-vector offsets array
285*0d0321e0SJeremy L Thompson   CeedInt *ind_to_offset, *lvec_indices;
286*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(lsize, &ind_to_offset); CeedChkBackend(ierr);
287*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(nnodes, &lvec_indices); CeedChkBackend(ierr);
288*0d0321e0SJeremy L Thompson   CeedInt j = 0;
289*0d0321e0SJeremy L Thompson   for (CeedInt i = 0; i < lsize; i++)
290*0d0321e0SJeremy L Thompson     if (isNode[i]) {
291*0d0321e0SJeremy L Thompson       lvec_indices[j] = i;
292*0d0321e0SJeremy L Thompson       ind_to_offset[i] = j++;
293*0d0321e0SJeremy L Thompson     }
294*0d0321e0SJeremy L Thompson   ierr = CeedFree(&isNode); CeedChkBackend(ierr);
295*0d0321e0SJeremy L Thompson 
296*0d0321e0SJeremy L Thompson   // Compute transpose offsets and indices
297*0d0321e0SJeremy L Thompson   const CeedInt sizeOffsets = nnodes + 1;
298*0d0321e0SJeremy L Thompson   CeedInt *toffsets;
299*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(sizeOffsets, &toffsets); CeedChkBackend(ierr);
300*0d0321e0SJeremy L Thompson   CeedInt *tindices;
301*0d0321e0SJeremy L Thompson   ierr = CeedMalloc(sizeIndices, &tindices); CeedChkBackend(ierr);
302*0d0321e0SJeremy L Thompson   // Count node multiplicity
303*0d0321e0SJeremy L Thompson   for (CeedInt e = 0; e < nelem; ++e)
304*0d0321e0SJeremy L Thompson     for (CeedInt i = 0; i < elemsize; ++i)
305*0d0321e0SJeremy L Thompson       ++toffsets[ind_to_offset[indices[elemsize*e + i]] + 1];
306*0d0321e0SJeremy L Thompson   // Convert to running sum
307*0d0321e0SJeremy L Thompson   for (CeedInt i = 1; i < sizeOffsets; ++i)
308*0d0321e0SJeremy L Thompson     toffsets[i] += toffsets[i-1];
309*0d0321e0SJeremy L Thompson   // List all E-vec indices associated with L-vec node
310*0d0321e0SJeremy L Thompson   for (CeedInt e = 0; e < nelem; ++e) {
311*0d0321e0SJeremy L Thompson     for (CeedInt i = 0; i < elemsize; ++i) {
312*0d0321e0SJeremy L Thompson       const CeedInt lid = elemsize*e + i;
313*0d0321e0SJeremy L Thompson       const CeedInt gid = indices[lid];
314*0d0321e0SJeremy L Thompson       tindices[toffsets[ind_to_offset[gid]]++] = lid;
315*0d0321e0SJeremy L Thompson     }
316*0d0321e0SJeremy L Thompson   }
317*0d0321e0SJeremy L Thompson   // Reset running sum
318*0d0321e0SJeremy L Thompson   for (int i = sizeOffsets - 1; i > 0; --i)
319*0d0321e0SJeremy L Thompson     toffsets[i] = toffsets[i - 1];
320*0d0321e0SJeremy L Thompson   toffsets[0] = 0;
321*0d0321e0SJeremy L Thompson 
322*0d0321e0SJeremy L Thompson   // Copy data to device
323*0d0321e0SJeremy L Thompson   // -- L-vector indices
324*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_lvec_indices, nnodes*sizeof(CeedInt));
325*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
326*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(impl->d_lvec_indices, lvec_indices,
327*0d0321e0SJeremy L Thompson                    nnodes*sizeof(CeedInt), hipMemcpyHostToDevice);
328*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
329*0d0321e0SJeremy L Thompson   // -- Transpose offsets
330*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_toffsets, sizeOffsets*sizeof(CeedInt));
331*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
332*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(impl->d_toffsets, toffsets, sizeOffsets*sizeof(CeedInt),
333*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
334*0d0321e0SJeremy L Thompson   // -- Transpose indices
335*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&impl->d_tindices, sizeIndices*sizeof(CeedInt));
336*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
337*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(impl->d_tindices, tindices, sizeIndices*sizeof(CeedInt),
338*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
339*0d0321e0SJeremy L Thompson 
340*0d0321e0SJeremy L Thompson   // Cleanup
341*0d0321e0SJeremy L Thompson   ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr);
342*0d0321e0SJeremy L Thompson   ierr = CeedFree(&lvec_indices); CeedChkBackend(ierr);
343*0d0321e0SJeremy L Thompson   ierr = CeedFree(&toffsets); CeedChkBackend(ierr);
344*0d0321e0SJeremy L Thompson   ierr = CeedFree(&tindices); CeedChkBackend(ierr);
345*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
346*0d0321e0SJeremy L Thompson }
347*0d0321e0SJeremy L Thompson 
348*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
349*0d0321e0SJeremy L Thompson // Create restriction
350*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
351*0d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode,
352*0d0321e0SJeremy L Thompson                                   const CeedInt *indices,
353*0d0321e0SJeremy L Thompson                                   CeedElemRestriction r) {
354*0d0321e0SJeremy L Thompson   int ierr;
355*0d0321e0SJeremy L Thompson   Ceed ceed;
356*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
357*0d0321e0SJeremy L Thompson   CeedElemRestriction_Hip *impl;
358*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr);
359*0d0321e0SJeremy L Thompson   CeedInt nelem, ncomp, elemsize;
360*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetNumElements(r, &nelem); CeedChkBackend(ierr);
361*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetNumComponents(r, &ncomp); CeedChkBackend(ierr);
362*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr);
363*0d0321e0SJeremy L Thompson   CeedInt size = nelem * elemsize;
364*0d0321e0SJeremy L Thompson   CeedInt strides[3] = {1, size, elemsize};
365*0d0321e0SJeremy L Thompson   CeedInt compstride = 1;
366*0d0321e0SJeremy L Thompson 
367*0d0321e0SJeremy L Thompson   // Stride data
368*0d0321e0SJeremy L Thompson   bool isStrided;
369*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionIsStrided(r, &isStrided); CeedChkBackend(ierr);
370*0d0321e0SJeremy L Thompson   if (isStrided) {
371*0d0321e0SJeremy L Thompson     bool backendstrides;
372*0d0321e0SJeremy L Thompson     ierr = CeedElemRestrictionHasBackendStrides(r, &backendstrides);
373*0d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
374*0d0321e0SJeremy L Thompson     if (!backendstrides) {
375*0d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr);
376*0d0321e0SJeremy L Thompson     }
377*0d0321e0SJeremy L Thompson   } else {
378*0d0321e0SJeremy L Thompson     ierr = CeedElemRestrictionGetCompStride(r, &compstride); CeedChkBackend(ierr);
379*0d0321e0SJeremy L Thompson   }
380*0d0321e0SJeremy L Thompson 
381*0d0321e0SJeremy L Thompson   impl->h_ind           = NULL;
382*0d0321e0SJeremy L Thompson   impl->h_ind_allocated = NULL;
383*0d0321e0SJeremy L Thompson   impl->d_ind           = NULL;
384*0d0321e0SJeremy L Thompson   impl->d_ind_allocated = NULL;
385*0d0321e0SJeremy L Thompson   impl->d_tindices      = NULL;
386*0d0321e0SJeremy L Thompson   impl->d_toffsets      = NULL;
387*0d0321e0SJeremy L Thompson   impl->nnodes = size;
388*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr);
389*0d0321e0SJeremy L Thompson   CeedInt layout[3] = {1, elemsize*nelem, elemsize};
390*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr);
391*0d0321e0SJeremy L Thompson 
392*0d0321e0SJeremy L Thompson   // Set up device indices/offset arrays
393*0d0321e0SJeremy L Thompson   if (mtype == CEED_MEM_HOST) {
394*0d0321e0SJeremy L Thompson     switch (cmode) {
395*0d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
396*0d0321e0SJeremy L Thompson       impl->h_ind_allocated = (CeedInt *)indices;
397*0d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
398*0d0321e0SJeremy L Thompson       break;
399*0d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
400*0d0321e0SJeremy L Thompson       impl->h_ind = (CeedInt *)indices;
401*0d0321e0SJeremy L Thompson       break;
402*0d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
403*0d0321e0SJeremy L Thompson       break;
404*0d0321e0SJeremy L Thompson     }
405*0d0321e0SJeremy L Thompson     if (indices != NULL) {
406*0d0321e0SJeremy L Thompson       ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
407*0d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
408*0d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind; // We own the device memory
409*0d0321e0SJeremy L Thompson       ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
410*0d0321e0SJeremy L Thompson                        hipMemcpyHostToDevice);
411*0d0321e0SJeremy L Thompson       CeedChk_Hip(ceed, ierr);
412*0d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
413*0d0321e0SJeremy L Thompson     }
414*0d0321e0SJeremy L Thompson   } else if (mtype == CEED_MEM_DEVICE) {
415*0d0321e0SJeremy L Thompson     switch (cmode) {
416*0d0321e0SJeremy L Thompson     case CEED_COPY_VALUES:
417*0d0321e0SJeremy L Thompson       if (indices != NULL) {
418*0d0321e0SJeremy L Thompson         ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt));
419*0d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
420*0d0321e0SJeremy L Thompson         impl->d_ind_allocated = impl->d_ind; // We own the device memory
421*0d0321e0SJeremy L Thompson         ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt),
422*0d0321e0SJeremy L Thompson                          hipMemcpyDeviceToDevice);
423*0d0321e0SJeremy L Thompson         CeedChk_Hip(ceed, ierr);
424*0d0321e0SJeremy L Thompson       }
425*0d0321e0SJeremy L Thompson       break;
426*0d0321e0SJeremy L Thompson     case CEED_OWN_POINTER:
427*0d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
428*0d0321e0SJeremy L Thompson       impl->d_ind_allocated = impl->d_ind;
429*0d0321e0SJeremy L Thompson       break;
430*0d0321e0SJeremy L Thompson     case CEED_USE_POINTER:
431*0d0321e0SJeremy L Thompson       impl->d_ind = (CeedInt *)indices;
432*0d0321e0SJeremy L Thompson     }
433*0d0321e0SJeremy L Thompson     if (indices != NULL) {
434*0d0321e0SJeremy L Thompson       ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr);
435*0d0321e0SJeremy L Thompson     }
436*0d0321e0SJeremy L Thompson   } else {
437*0d0321e0SJeremy L Thompson     // LCOV_EXCL_START
438*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
439*0d0321e0SJeremy L Thompson                      "Only MemType = HOST or DEVICE supported");
440*0d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
441*0d0321e0SJeremy L Thompson   }
442*0d0321e0SJeremy L Thompson 
443*0d0321e0SJeremy L Thompson   // Compile HIP kernels
444*0d0321e0SJeremy L Thompson   CeedInt nnodes = impl->nnodes;
445*0d0321e0SJeremy L Thompson   ierr = CeedCompileHip(ceed, restrictionkernels, &impl->module, 8,
446*0d0321e0SJeremy L Thompson                         "RESTRICTION_ELEMSIZE", elemsize,
447*0d0321e0SJeremy L Thompson                         "RESTRICTION_NELEM", nelem,
448*0d0321e0SJeremy L Thompson                         "RESTRICTION_NCOMP", ncomp,
449*0d0321e0SJeremy L Thompson                         "RESTRICTION_NNODES", nnodes,
450*0d0321e0SJeremy L Thompson                         "RESTRICTION_COMPSTRIDE", compstride,
451*0d0321e0SJeremy L Thompson                         "STRIDE_NODES", strides[0],
452*0d0321e0SJeremy L Thompson                         "STRIDE_COMP", strides[1],
453*0d0321e0SJeremy L Thompson                         "STRIDE_ELEM", strides[2]); CeedChkBackend(ierr);
454*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "noTrStrided",
455*0d0321e0SJeremy L Thompson                           &impl->noTrStrided); CeedChkBackend(ierr);
456*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "noTrOffset", &impl->noTrOffset);
457*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
458*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "trStrided", &impl->trStrided);
459*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
460*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, impl->module, "trOffset", &impl->trOffset);
461*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
462*0d0321e0SJeremy L Thompson 
463*0d0321e0SJeremy L Thompson   // Register backend functions
464*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply",
465*0d0321e0SJeremy L Thompson                                 CeedElemRestrictionApply_Hip);
466*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
467*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock",
468*0d0321e0SJeremy L Thompson                                 CeedElemRestrictionApplyBlock_Hip);
469*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
470*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets",
471*0d0321e0SJeremy L Thompson                                 CeedElemRestrictionGetOffsets_Hip);
472*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
473*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy",
474*0d0321e0SJeremy L Thompson                                 CeedElemRestrictionDestroy_Hip);
475*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
476*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
477*0d0321e0SJeremy L Thompson }
478*0d0321e0SJeremy L Thompson 
479*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
480*0d0321e0SJeremy L Thompson // Blocked not supported
481*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
482*0d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype,
483*0d0321e0SJeremy L Thompson     const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) {
484*0d0321e0SJeremy L Thompson   int ierr;
485*0d0321e0SJeremy L Thompson   Ceed ceed;
486*0d0321e0SJeremy L Thompson   ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr);
487*0d0321e0SJeremy L Thompson   return CeedError(ceed, CEED_ERROR_BACKEND,
488*0d0321e0SJeremy L Thompson                    "Backend does not implement blocked restrictions");
489*0d0321e0SJeremy L Thompson }
490*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
491