xref: /libCEED/backends/hip-ref/ceed-hip-ref-basis.c (revision d92fedf5b7546cf2fc50391dbcfb657a2e1f0a3b)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <ceed/backend.h>
19 #include <hip/hip_runtime.h>
20 #include "ceed-hip-ref.h"
21 #include "../hip/ceed-hip-compile.h"
22 
23 //------------------------------------------------------------------------------
24 // Tensor Basis Kernels
25 //------------------------------------------------------------------------------
26 // *INDENT-OFF*
static const char *basiskernels = QUOTE(

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Tensor-product interpolation: contracts the 1D interp matrix along each of
// the BASIS_DIM dimensions in turn, ping-ponging intermediate results between
// two shared-memory buffers.
//   transpose == 0: element nodes -> quadrature points
//   transpose != 0: quadrature points -> element nodes (interp1d applied
//                   transposed; P/Q and the matrix strides swap below)
// Launch: blockIdx.x strides over elements; threads of a block cooperate on
// each contraction (any blockDim.x works, the loops stride by blockDim.x).
extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
                                  const CeedScalar *__restrict__ interp1d,
                                  const CeedScalar *__restrict__ u,
                                  CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp1d matrix | buffer 1 | buffer 2]
  __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN];
  CeedScalar *s_interp1d = s_mem;
  CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D;
  CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
  // Cooperatively stage the 1D matrix in shared memory; the __syncthreads()
  // at the top of the d-loop below orders this write before the first read
  for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
    s_interp1d[k] = interp1d[k];
  }

  // Contraction sizes and strides; transpose swaps the roles of P1D/Q1D and
  // selects row- vs column-wise traversal of interp1d
  const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
  const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
  const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
  const CeedInt stride1 = transpose ? BASIS_P1D : 1;
  // Per-element and per-component strides in the u/v arrays
  const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
  const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
  const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
  const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
  // Number of input values per (element, component)
  const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {
      const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
      CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
      // Load this element/component's input into the first buffer
      for (CeedInt k = i; k < u_size; k += blockDim.x) {
        s_buf1[k] = cur_u[k];
      }
      // View the data as pre x P x post and contract one dimension at a time
      CeedInt pre = u_size;
      CeedInt post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();
        // Update buffers used; last contraction writes straight to cur_v
        pre /= P;
        const CeedScalar *in = d % 2 ? s_buf2 : s_buf1;
        CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2);

        // Contract along middle index
        const CeedInt writeLen = pre * post * Q;
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          // Decompose flat output index k into (a, j, c) = (pre, Q, post)
          const CeedInt c = k % post;
          const CeedInt j = (k / post) % Q;
          const CeedInt a = k / (post * Q);

          CeedScalar vk = 0;
          for (CeedInt b = 0; b < P; b++)
            vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c];

          out[k] = vk;
        }

        post *= Q;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Tensor-product gradient: for each derivative direction dim1, contracts
// grad1d along dimension dim1 and interp1d along the remaining dimensions.
//   transpose == 0: nodes -> per-direction quadrature derivatives
//   transpose != 0: the adjoint action; contributions from every dim1 are
//                   accumulated (+=) into v, which the caller zeroes first
// Launch is the same as interp above.
extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
                                const CeedScalar *__restrict__ interp1d,
                                const CeedScalar *__restrict__ grad1d,
                                const CeedScalar *__restrict__ u,
                                CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp1d | grad1d | buffer 1 | buffer 2]
  __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)];
  CeedScalar *s_interp1d = s_mem;
  CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D;
  CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D;
  CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
  // Cooperatively stage both 1D matrices; ordered before first use by the
  // __syncthreads() at the top of the dim2 loop
  for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
    s_interp1d[k] = interp1d[k];
    s_grad1d[k] = grad1d[k];
  }

  // Contraction sizes and strides (same convention as interp above)
  const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
  const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
  const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
  const CeedInt stride1 = transpose ? BASIS_P1D : 1;
  const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
  const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
  const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
  const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
  // Stride between derivative directions: in u for transpose, in v otherwise
  const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0;
  const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {

      // dim*dim contractions for grad
      for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) {
        CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
        CeedInt post = 1;
        const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride +
                                  comp * u_comp_stride;
        CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp *
                            v_comp_stride;
        for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) {
          __syncthreads();
          // Update buffers used: grad1d on the dim1 == dim2 contraction,
          // interp1d on the rest; first input is cur_u, last output cur_v
          pre /= P;
          const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d;
          const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1);
          CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2);

          // Contract along middle index
          const CeedInt writeLen = pre * post * Q;
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % Q;
            const CeedInt a = k / (post * Q);
            CeedScalar vk = 0;
            for (CeedInt b = 0; b < P; b++)
              vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c];

            // Transpose sums the dim1 contributions into v (v pre-zeroed)
            if (transpose && dim2 == BASIS_DIM - 1)
              out[k] += vk;
            else
              out[k] = vk;
          }

          post *= Q;
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Grid-stride loop with one thread per element; each thread writes the
// BASIS_Q1D weights for its element
__device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  // Copy the 1D weights into a thread-local array
  CeedScalar w1d[BASIS_Q1D];
  for (int i = 0; i < BASIS_Q1D; ++i)
    w1d[i] = qweight1d[i];

  for (int e = blockIdx.x * blockDim.x + threadIdx.x;
       e < nelem;
       e += blockDim.x * gridDim.x)
    for (int i = 0; i < BASIS_Q1D; ++i) {
      const int ind = e*BASIS_Q1D + i; // sequential
      w[ind] = w1d[i];
    }
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// As weight1d, but writes the tensor product w1d[i]*w1d[j] per element
__device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  CeedScalar w1d[BASIS_Q1D];
  for (int i = 0; i < BASIS_Q1D; ++i)
    w1d[i] = qweight1d[i];

  for (int e = blockIdx.x * blockDim.x + threadIdx.x;
       e < nelem;
       e += blockDim.x * gridDim.x)
    for (int i = 0; i < BASIS_Q1D; ++i)
      for (int j = 0; j < BASIS_Q1D; ++j) {
        const int ind = e*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D; // sequential
        w[ind] = w1d[i]*w1d[j];
      }
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// As weight1d, but writes the triple tensor product per element
__device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  CeedScalar w1d[BASIS_Q1D];
  for (int i = 0; i < BASIS_Q1D; ++i)
    w1d[i] = qweight1d[i];

  for (int e = blockIdx.x * blockDim.x + threadIdx.x;
       e < nelem;
       e += blockDim.x * gridDim.x)
    for (int i = 0; i < BASIS_Q1D; ++i)
      for (int j = 0; j < BASIS_Q1D; ++j)
        for (int k = 0; k < BASIS_Q1D; ++k) {
          const int ind = e*BASIS_Q1D*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D +
                          k*BASIS_Q1D*BASIS_Q1D; // sequential
          w[ind] = w1d[i]*w1d[j]*w1d[k];
        }
}

//------------------------------------------------------------------------------
// Quadrature weights
//------------------------------------------------------------------------------
// Dispatcher; BASIS_DIM is a compile-time macro supplied by the host-side
// compile call, so only one branch survives compilation
extern "C" __global__ void weight(const CeedInt nelem,
                                  const CeedScalar *__restrict__ qweight1d,
                                  CeedScalar *__restrict__ v) {
  if (BASIS_DIM==1)
    weight1d(nelem, qweight1d, v);
  else if (BASIS_DIM==2)
    weight2d(nelem, qweight1d, v);
  else if (BASIS_DIM==3)
    weight3d(nelem, qweight1d, v);
}

);
240 
241 //------------------------------------------------------------------------------
242 // Non-Tensor Basis Kernels
243 //------------------------------------------------------------------------------
static const char *kernelsNonTensorRef = QUOTE(

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Dense interpolation for non-tensor bases: v = B u (B^T u for transpose).
// Launch: threadIdx.x indexes one output row, so the host launches Q threads
// for the forward action and P threads for the transpose; threadIdx.z /
// blockDim.z pack several elements per block, and blocks stride over
// elements.  P and Q are compile-time macros (nnodes and nqpts).
extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
                                  const CeedScalar *d_B,
                                  const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  const int tid = threadIdx.x;

  const CeedScalar *U;
  CeedScalar V;
  //TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for (int comp = 0; comp < BASIS_NCOMP; comp++) {
      if (!transpose) { // run with Q threads
        // d_B is indexed with the node index fastest: B[i + q*P]
        U = d_U + elem*P + comp*nelem*P;
        V = 0.0;
        for (int i = 0; i < P; ++i)
          V += d_B[i + tid*P]*U[i];

        d_V[elem*Q + comp*nelem*Q + tid] = V;
      } else { // run with P threads
        U = d_U + elem*Q + comp*nelem*Q;
        V = 0.0;
        for (int i = 0; i < Q; ++i)
          V += d_B[tid + i*P]*U[i];

        d_V[elem*P + comp*nelem*P + tid] = V;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Dense gradient for non-tensor bases using the (dim*Q) x P matrix d_G.
//   transpose == 0: one output value per quadrature point per direction,
//                   written at stride dim*BASIS_NCOMP*nelem*Q
//   transpose != 0: contributions from all directions are summed in the
//                   register accumulator V before the single store
// Same launch convention as interp above.
extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
                                const CeedScalar *d_G,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V) {
  const int tid = threadIdx.x;

  const CeedScalar *U;
  //TODO load G in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for (int comp=0; comp<BASIS_NCOMP; comp++) {
      if (!transpose) { // run with Q threads
        // Accumulate all BASIS_DIM directional derivatives for this point
        CeedScalar V[BASIS_DIM];
        U = d_U + elem*P + comp*nelem*P;
        for (int dim = 0; dim < BASIS_DIM; dim++)
          V[dim] = 0.0;

        for (int i = 0; i < P; ++i) {
          const CeedScalar val = U[i];
          for(int dim = 0; dim < BASIS_DIM; dim++)
            V[dim] += d_G[i + tid*P + dim*P*Q]*val;
        }
        for (int dim = 0; dim < BASIS_DIM; dim++) {
          d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim];
        }
      } else { // run with P threads
        CeedScalar V = 0.0;
        for (int dim = 0; dim < BASIS_DIM; dim++) {
          U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q;
          for (int i = 0; i < Q; ++i)
            V += d_G[tid + i*P + dim*P*Q]*U[i];
        }
        d_V[elem*P + comp*nelem*P + tid] = V;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Weight
//------------------------------------------------------------------------------
// Copies the quadrature weights for every element; run with Q threads so
// each thread writes one weight per element
extern "C" __global__ void weight(const CeedInt nelem,
                                  const CeedScalar *__restrict__ qweight,
                                  CeedScalar *__restrict__ d_V) {
  const int tid = threadIdx.x;
  //TODO load qweight in shared memory if blockDim.z > 1?
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    d_V[elem*Q + tid] = qweight[tid];
  }
}

);
338 // *INDENT-ON*
339 
340 //------------------------------------------------------------------------------
341 // Basis apply - tensor
342 //------------------------------------------------------------------------------
343 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt nelem,
344                        CeedTransposeMode tmode,
345                        CeedEvalMode emode, CeedVector u, CeedVector v) {
346   int ierr;
347   Ceed ceed;
348   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
349   Ceed_Hip *ceed_Hip;
350   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
351   CeedBasis_Hip *data;
352   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
353   const CeedInt transpose = tmode == CEED_TRANSPOSE;
354   const int maxblocksize = 64;
355 
356   // Read vectors
357   const CeedScalar *d_u;
358   CeedScalar *d_v;
359   if (emode != CEED_EVAL_WEIGHT) {
360     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
361   }
362   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
363 
364   // Clear v for transpose operation
365   if (tmode == CEED_TRANSPOSE) {
366     CeedInt length;
367     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
368     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
369     CeedChk_Hip(ceed,ierr);
370   }
371 
372   // Basis action
373   switch (emode) {
374   case CEED_EVAL_INTERP: {
375     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
376                           &data->d_interp1d, &d_u, &d_v
377                          };
378     CeedInt Q1d, dim;
379     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
380     ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
381     CeedInt blocksize = CeedIntPow(Q1d, dim);
382     blocksize = blocksize > maxblocksize ? maxblocksize : blocksize;
383 
384     ierr = CeedRunKernelHip(ceed, data->interp, nelem, blocksize, interpargs);
385     CeedChkBackend(ierr);
386   } break;
387   case CEED_EVAL_GRAD: {
388     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d,
389                         &data->d_grad1d, &d_u, &d_v
390                        };
391     CeedInt blocksize = maxblocksize;
392 
393     ierr = CeedRunKernelHip(ceed, data->grad, nelem, blocksize, gradargs);
394     CeedChkBackend(ierr);
395   } break;
396   case CEED_EVAL_WEIGHT: {
397     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
398     const int blocksize = 64;
399     int gridsize = nelem/blocksize;
400     if (blocksize * gridsize < nelem)
401       gridsize += 1;
402 
403     ierr = CeedRunKernelHip(ceed, data->weight, gridsize, blocksize,
404                             weightargs); CeedChkBackend(ierr);
405   } break;
406   // LCOV_EXCL_START
407   // Evaluate the divergence to/from the quadrature points
408   case CEED_EVAL_DIV:
409     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
410   // Evaluate the curl to/from the quadrature points
411   case CEED_EVAL_CURL:
412     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
413   // Take no action, BasisApply should not have been called
414   case CEED_EVAL_NONE:
415     return CeedError(ceed, CEED_ERROR_BACKEND,
416                      "CEED_EVAL_NONE does not make sense in this context");
417     // LCOV_EXCL_STOP
418   }
419 
420   // Restore vectors
421   if (emode != CEED_EVAL_WEIGHT) {
422     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
423   }
424   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
425   return CEED_ERROR_SUCCESS;
426 }
427 
428 //------------------------------------------------------------------------------
429 // Basis apply - non-tensor
430 //------------------------------------------------------------------------------
431 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt nelem,
432                                 CeedTransposeMode tmode, CeedEvalMode emode,
433                                 CeedVector u, CeedVector v) {
434   int ierr;
435   Ceed ceed;
436   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
437   Ceed_Hip *ceed_Hip;
438   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
439   CeedBasisNonTensor_Hip *data;
440   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
441   CeedInt nnodes, nqpt;
442   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
443   ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr);
444   const CeedInt transpose = tmode == CEED_TRANSPOSE;
445   int elemsPerBlock = 1;
446   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
447 
448   // Read vectors
449   const CeedScalar *d_u;
450   CeedScalar *d_v;
451   if (emode != CEED_EVAL_WEIGHT) {
452     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
453   }
454   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
455 
456   // Clear v for transpose operation
457   if (tmode == CEED_TRANSPOSE) {
458     CeedInt length;
459     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
460     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
461     CeedChk_Hip(ceed, ierr);
462   }
463 
464   // Apply basis operation
465   switch (emode) {
466   case CEED_EVAL_INTERP: {
467     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
468                           &data->d_interp, &d_u, &d_v
469                          };
470     if (!transpose) {
471       ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nqpt, 1,
472                                  elemsPerBlock, interpargs); CeedChkBackend(ierr);
473     } else {
474       ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nnodes, 1,
475                                  elemsPerBlock, interpargs); CeedChkBackend(ierr);
476     }
477   } break;
478   case CEED_EVAL_GRAD: {
479     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad,
480                         &d_u, &d_v
481                        };
482     if (!transpose) {
483       ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nqpt, 1,
484                                  elemsPerBlock, gradargs); CeedChkBackend(ierr);
485     } else {
486       ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nnodes, 1,
487                                  elemsPerBlock, gradargs); CeedChkBackend(ierr);
488     }
489   } break;
490   case CEED_EVAL_WEIGHT: {
491     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v};
492     ierr = CeedRunKernelDimHip(ceed, data->weight, grid, nqpt, 1,
493                                elemsPerBlock, weightargs); CeedChkBackend(ierr);
494   } break;
495   // LCOV_EXCL_START
496   // Evaluate the divergence to/from the quadrature points
497   case CEED_EVAL_DIV:
498     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
499   // Evaluate the curl to/from the quadrature points
500   case CEED_EVAL_CURL:
501     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
502   // Take no action, BasisApply should not have been called
503   case CEED_EVAL_NONE:
504     return CeedError(ceed, CEED_ERROR_BACKEND,
505                      "CEED_EVAL_NONE does not make sense in this context");
506     // LCOV_EXCL_STOP
507   }
508 
509   // Restore vectors
510   if (emode != CEED_EVAL_WEIGHT) {
511     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
512   }
513   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
514   return CEED_ERROR_SUCCESS;
515 }
516 
517 //------------------------------------------------------------------------------
518 // Destroy tensor basis
519 //------------------------------------------------------------------------------
520 static int CeedBasisDestroy_Hip(CeedBasis basis) {
521   int ierr;
522   Ceed ceed;
523   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
524 
525   CeedBasis_Hip *data;
526   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
527 
528   CeedChk_Hip(ceed, hipModuleUnload(data->module));
529 
530   ierr = hipFree(data->d_qweight1d); CeedChk_Hip(ceed,ierr);
531   ierr = hipFree(data->d_interp1d); CeedChk_Hip(ceed,ierr);
532   ierr = hipFree(data->d_grad1d); CeedChk_Hip(ceed,ierr);
533 
534   ierr = CeedFree(&data); CeedChkBackend(ierr);
535   return CEED_ERROR_SUCCESS;
536 }
537 
538 //------------------------------------------------------------------------------
539 // Destroy non-tensor basis
540 //------------------------------------------------------------------------------
541 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
542   int ierr;
543   Ceed ceed;
544   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
545 
546   CeedBasisNonTensor_Hip *data;
547   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
548 
549   CeedChk_Hip(ceed, hipModuleUnload(data->module));
550 
551   ierr = hipFree(data->d_qweight); CeedChk_Hip(ceed, ierr);
552   ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr);
553   ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr);
554 
555   ierr = CeedFree(&data); CeedChkBackend(ierr);
556   return CEED_ERROR_SUCCESS;
557 }
558 
559 //------------------------------------------------------------------------------
560 // Create tensor
561 //------------------------------------------------------------------------------
562 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, CeedInt Q1d,
563                                 const CeedScalar *interp1d,
564                                 const CeedScalar *grad1d,
565                                 const CeedScalar *qref1d,
566                                 const CeedScalar *qweight1d,
567                                 CeedBasis basis) {
568   int ierr;
569   Ceed ceed;
570   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
571   CeedBasis_Hip *data;
572   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
573 
574   // Copy data to GPU
575   const CeedInt qBytes = Q1d * sizeof(CeedScalar);
576   ierr = hipMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Hip(ceed,ierr);
577   ierr = hipMemcpy(data->d_qweight1d, qweight1d, qBytes,
578                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
579 
580   const CeedInt iBytes = qBytes * P1d;
581   ierr = hipMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Hip(ceed,ierr);
582   ierr = hipMemcpy(data->d_interp1d, interp1d, iBytes,
583                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
584 
585   ierr = hipMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Hip(ceed,ierr);
586   ierr = hipMemcpy(data->d_grad1d, grad1d, iBytes,
587                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
588 
589   // Complie basis kernels
590   CeedInt ncomp;
591   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
592   ierr = CeedCompileHip(ceed, basiskernels, &data->module, 7,
593                         "BASIS_Q1D", Q1d,
594                         "BASIS_P1D", P1d,
595                         "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
596                             Q1d : P1d, dim),
597                         "BASIS_DIM", dim,
598                         "BASIS_NCOMP", ncomp,
599                         "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
600                         "BASIS_NQPT", CeedIntPow(Q1d, dim)
601                        ); CeedChkBackend(ierr);
602   ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp);
603   CeedChkBackend(ierr);
604   ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad);
605   CeedChkBackend(ierr);
606   ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight);
607   CeedChkBackend(ierr);
608   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
609 
610   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
611                                 CeedBasisApply_Hip); CeedChkBackend(ierr);
612   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
613                                 CeedBasisDestroy_Hip); CeedChkBackend(ierr);
614   return CEED_ERROR_SUCCESS;
615 }
616 
617 //------------------------------------------------------------------------------
618 // Create non-tensor
619 //------------------------------------------------------------------------------
620 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt nnodes,
621                           CeedInt nqpts, const CeedScalar *interp,
622                           const CeedScalar *grad, const CeedScalar *qref,
623                           const CeedScalar *qweight, CeedBasis basis) {
624   int ierr;
625   Ceed ceed;
626   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
627   CeedBasisNonTensor_Hip *data;
628   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
629 
630   // Copy basis data to GPU
631   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
632   ierr = hipMalloc((void **)&data->d_qweight, qBytes); CeedChk_Hip(ceed, ierr);
633   ierr = hipMemcpy(data->d_qweight, qweight, qBytes,
634                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
635 
636   const CeedInt iBytes = qBytes * nnodes;
637   ierr = hipMalloc((void **)&data->d_interp, iBytes); CeedChk_Hip(ceed, ierr);
638   ierr = hipMemcpy(data->d_interp, interp, iBytes,
639                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
640 
641   const CeedInt gBytes = qBytes * nnodes * dim;
642   ierr = hipMalloc((void **)&data->d_grad, gBytes); CeedChk_Hip(ceed, ierr);
643   ierr = hipMemcpy(data->d_grad, grad, gBytes,
644                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
645 
646   // Compile basis kernels
647   CeedInt ncomp;
648   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
649   ierr = CeedCompileHip(ceed, kernelsNonTensorRef, &data->module, 4,
650                         "Q", nqpts,
651                         "P", nnodes,
652                         "BASIS_DIM", dim,
653                         "BASIS_NCOMP", ncomp
654                        ); CeedChk_Hip(ceed, ierr);
655   ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp);
656   CeedChk_Hip(ceed, ierr);
657   ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad);
658   CeedChk_Hip(ceed, ierr);
659   ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight);
660   CeedChk_Hip(ceed, ierr);
661 
662   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
663 
664   // Register backend functions
665   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
666                                 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr);
667   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
668                                 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr);
669   return CEED_ERROR_SUCCESS;
670 }
671 //------------------------------------------------------------------------------
672