xref: /libCEED/rust/libceed-sys/c-src/backends/hip-ref/ceed-hip-ref-basis.c (revision 0d0321e0e600f17fbb9528732fcb5c1d5c63fc0f)
1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
4*0d0321e0SJeremy L Thompson //
5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
9*0d0321e0SJeremy L Thompson //
10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
16*0d0321e0SJeremy L Thompson 
17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h>
18*0d0321e0SJeremy L Thompson #include <ceed/backend.h>
19*0d0321e0SJeremy L Thompson #include <hip/hip_runtime.h>
20*0d0321e0SJeremy L Thompson #include "ceed-hip-ref.h"
21*0d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h"
22*0d0321e0SJeremy L Thompson 
23*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
24*0d0321e0SJeremy L Thompson // Tensor Basis Kernels
25*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
26*0d0321e0SJeremy L Thompson // *INDENT-OFF*
27*0d0321e0SJeremy L Thompson static const char *basiskernels = QUOTE(
28*0d0321e0SJeremy L Thompson 
29*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
30*0d0321e0SJeremy L Thompson // Interp
31*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
32*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
33*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ interp1d,
34*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ u,
35*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ v) {
36*0d0321e0SJeremy L Thompson   const CeedInt i = threadIdx.x;
37*0d0321e0SJeremy L Thompson 
38*0d0321e0SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN];
39*0d0321e0SJeremy L Thompson   CeedScalar *s_interp1d = s_mem;
40*0d0321e0SJeremy L Thompson   CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D;
41*0d0321e0SJeremy L Thompson   CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
42*0d0321e0SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
43*0d0321e0SJeremy L Thompson     s_interp1d[k] = interp1d[k];
44*0d0321e0SJeremy L Thompson   }
45*0d0321e0SJeremy L Thompson 
46*0d0321e0SJeremy L Thompson   const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
47*0d0321e0SJeremy L Thompson   const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
48*0d0321e0SJeremy L Thompson   const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
49*0d0321e0SJeremy L Thompson   const CeedInt stride1 = transpose ? BASIS_P1D : 1;
50*0d0321e0SJeremy L Thompson   const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
51*0d0321e0SJeremy L Thompson   const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
52*0d0321e0SJeremy L Thompson   const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
53*0d0321e0SJeremy L Thompson   const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
54*0d0321e0SJeremy L Thompson   const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
55*0d0321e0SJeremy L Thompson 
56*0d0321e0SJeremy L Thompson   // Apply basis element by element
57*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
58*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {
59*0d0321e0SJeremy L Thompson       const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
60*0d0321e0SJeremy L Thompson       CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
61*0d0321e0SJeremy L Thompson       for (CeedInt k = i; k < u_size; k += blockDim.x) {
62*0d0321e0SJeremy L Thompson         s_buf1[k] = cur_u[k];
63*0d0321e0SJeremy L Thompson       }
64*0d0321e0SJeremy L Thompson       CeedInt pre = u_size;
65*0d0321e0SJeremy L Thompson       CeedInt post = 1;
66*0d0321e0SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
67*0d0321e0SJeremy L Thompson         __syncthreads();
68*0d0321e0SJeremy L Thompson         // Update buffers used
69*0d0321e0SJeremy L Thompson         pre /= P;
70*0d0321e0SJeremy L Thompson         const CeedScalar *in = d % 2 ? s_buf2 : s_buf1;
71*0d0321e0SJeremy L Thompson         CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2);
72*0d0321e0SJeremy L Thompson 
73*0d0321e0SJeremy L Thompson         // Contract along middle index
74*0d0321e0SJeremy L Thompson         const CeedInt writeLen = pre * post * Q;
75*0d0321e0SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
76*0d0321e0SJeremy L Thompson           const CeedInt c = k % post;
77*0d0321e0SJeremy L Thompson           const CeedInt j = (k / post) % Q;
78*0d0321e0SJeremy L Thompson           const CeedInt a = k / (post * Q);
79*0d0321e0SJeremy L Thompson 
80*0d0321e0SJeremy L Thompson           CeedScalar vk = 0;
81*0d0321e0SJeremy L Thompson           for (CeedInt b = 0; b < P; b++)
82*0d0321e0SJeremy L Thompson             vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c];
83*0d0321e0SJeremy L Thompson 
84*0d0321e0SJeremy L Thompson           out[k] = vk;
85*0d0321e0SJeremy L Thompson         }
86*0d0321e0SJeremy L Thompson 
87*0d0321e0SJeremy L Thompson         post *= Q;
88*0d0321e0SJeremy L Thompson       }
89*0d0321e0SJeremy L Thompson     }
90*0d0321e0SJeremy L Thompson   }
91*0d0321e0SJeremy L Thompson }
92*0d0321e0SJeremy L Thompson 
93*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
94*0d0321e0SJeremy L Thompson // Grad
95*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
96*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
97*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ interp1d,
98*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ grad1d,
99*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ u,
100*0d0321e0SJeremy L Thompson                                 CeedScalar *__restrict__ v) {
101*0d0321e0SJeremy L Thompson   const CeedInt i = threadIdx.x;
102*0d0321e0SJeremy L Thompson 
103*0d0321e0SJeremy L Thompson   __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)];
104*0d0321e0SJeremy L Thompson   CeedScalar *s_interp1d = s_mem;
105*0d0321e0SJeremy L Thompson   CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D;
106*0d0321e0SJeremy L Thompson   CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D;
107*0d0321e0SJeremy L Thompson   CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
108*0d0321e0SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
109*0d0321e0SJeremy L Thompson     s_interp1d[k] = interp1d[k];
110*0d0321e0SJeremy L Thompson     s_grad1d[k] = grad1d[k];
111*0d0321e0SJeremy L Thompson   }
112*0d0321e0SJeremy L Thompson 
113*0d0321e0SJeremy L Thompson   const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
114*0d0321e0SJeremy L Thompson   const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
115*0d0321e0SJeremy L Thompson   const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
116*0d0321e0SJeremy L Thompson   const CeedInt stride1 = transpose ? BASIS_P1D : 1;
117*0d0321e0SJeremy L Thompson   const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
118*0d0321e0SJeremy L Thompson   const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
119*0d0321e0SJeremy L Thompson   const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
120*0d0321e0SJeremy L Thompson   const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
121*0d0321e0SJeremy L Thompson   const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0;
122*0d0321e0SJeremy L Thompson   const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP;
123*0d0321e0SJeremy L Thompson 
124*0d0321e0SJeremy L Thompson   // Apply basis element by element
125*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
126*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {
127*0d0321e0SJeremy L Thompson 
128*0d0321e0SJeremy L Thompson       // dim*dim contractions for grad
129*0d0321e0SJeremy L Thompson       for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) {
130*0d0321e0SJeremy L Thompson         CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
131*0d0321e0SJeremy L Thompson         CeedInt post = 1;
132*0d0321e0SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride +
133*0d0321e0SJeremy L Thompson                                   comp * u_comp_stride;
134*0d0321e0SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp *
135*0d0321e0SJeremy L Thompson                             v_comp_stride;
136*0d0321e0SJeremy L Thompson         for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) {
137*0d0321e0SJeremy L Thompson           __syncthreads();
138*0d0321e0SJeremy L Thompson           // Update buffers used
139*0d0321e0SJeremy L Thompson           pre /= P;
140*0d0321e0SJeremy L Thompson           const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d;
141*0d0321e0SJeremy L Thompson           const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1);
142*0d0321e0SJeremy L Thompson           CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2);
143*0d0321e0SJeremy L Thompson 
144*0d0321e0SJeremy L Thompson           // Contract along middle index
145*0d0321e0SJeremy L Thompson           const CeedInt writeLen = pre * post * Q;
146*0d0321e0SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
147*0d0321e0SJeremy L Thompson             const CeedInt c = k % post;
148*0d0321e0SJeremy L Thompson             const CeedInt j = (k / post) % Q;
149*0d0321e0SJeremy L Thompson             const CeedInt a = k / (post * Q);
150*0d0321e0SJeremy L Thompson             CeedScalar vk = 0;
151*0d0321e0SJeremy L Thompson             for (CeedInt b = 0; b < P; b++)
152*0d0321e0SJeremy L Thompson               vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c];
153*0d0321e0SJeremy L Thompson 
154*0d0321e0SJeremy L Thompson             if (transpose && dim2 == BASIS_DIM - 1)
155*0d0321e0SJeremy L Thompson               out[k] += vk;
156*0d0321e0SJeremy L Thompson             else
157*0d0321e0SJeremy L Thompson               out[k] = vk;
158*0d0321e0SJeremy L Thompson           }
159*0d0321e0SJeremy L Thompson 
160*0d0321e0SJeremy L Thompson           post *= Q;
161*0d0321e0SJeremy L Thompson         }
162*0d0321e0SJeremy L Thompson       }
163*0d0321e0SJeremy L Thompson     }
164*0d0321e0SJeremy L Thompson   }
165*0d0321e0SJeremy L Thompson }
166*0d0321e0SJeremy L Thompson 
167*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
168*0d0321e0SJeremy L Thompson // 1D quadrature weights
169*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
170*0d0321e0SJeremy L Thompson __device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
171*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
172*0d0321e0SJeremy L Thompson   CeedScalar w1d[BASIS_Q1D];
173*0d0321e0SJeremy L Thompson   for (int i = 0; i < BASIS_Q1D; ++i)
174*0d0321e0SJeremy L Thompson     w1d[i] = qweight1d[i];
175*0d0321e0SJeremy L Thompson 
176*0d0321e0SJeremy L Thompson   for (int e = blockIdx.x * blockDim.x + threadIdx.x;
177*0d0321e0SJeremy L Thompson        e < nelem;
178*0d0321e0SJeremy L Thompson        e += blockDim.x * gridDim.x)
179*0d0321e0SJeremy L Thompson     for (int i = 0; i < BASIS_Q1D; ++i) {
180*0d0321e0SJeremy L Thompson       const int ind = e*BASIS_Q1D + i; // sequential
181*0d0321e0SJeremy L Thompson       w[ind] = w1d[i];
182*0d0321e0SJeremy L Thompson     }
183*0d0321e0SJeremy L Thompson }
184*0d0321e0SJeremy L Thompson 
185*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
186*0d0321e0SJeremy L Thompson // 2D quadrature weights
187*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
188*0d0321e0SJeremy L Thompson __device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
189*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
190*0d0321e0SJeremy L Thompson   CeedScalar w1d[BASIS_Q1D];
191*0d0321e0SJeremy L Thompson   for (int i = 0; i < BASIS_Q1D; ++i)
192*0d0321e0SJeremy L Thompson     w1d[i] = qweight1d[i];
193*0d0321e0SJeremy L Thompson 
194*0d0321e0SJeremy L Thompson   for (int e = blockIdx.x * blockDim.x + threadIdx.x;
195*0d0321e0SJeremy L Thompson        e < nelem;
196*0d0321e0SJeremy L Thompson        e += blockDim.x * gridDim.x)
197*0d0321e0SJeremy L Thompson     for (int i = 0; i < BASIS_Q1D; ++i)
198*0d0321e0SJeremy L Thompson       for (int j = 0; j < BASIS_Q1D; ++j) {
199*0d0321e0SJeremy L Thompson         const int ind = e*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D; // sequential
200*0d0321e0SJeremy L Thompson         w[ind] = w1d[i]*w1d[j];
201*0d0321e0SJeremy L Thompson       }
202*0d0321e0SJeremy L Thompson }
203*0d0321e0SJeremy L Thompson 
204*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
205*0d0321e0SJeremy L Thompson // 3D quadrature weights
206*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
207*0d0321e0SJeremy L Thompson __device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
208*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
209*0d0321e0SJeremy L Thompson   CeedScalar w1d[BASIS_Q1D];
210*0d0321e0SJeremy L Thompson   for (int i = 0; i < BASIS_Q1D; ++i)
211*0d0321e0SJeremy L Thompson     w1d[i] = qweight1d[i];
212*0d0321e0SJeremy L Thompson 
213*0d0321e0SJeremy L Thompson   for (int e = blockIdx.x * blockDim.x + threadIdx.x;
214*0d0321e0SJeremy L Thompson        e < nelem;
215*0d0321e0SJeremy L Thompson        e += blockDim.x * gridDim.x)
216*0d0321e0SJeremy L Thompson     for (int i = 0; i < BASIS_Q1D; ++i)
217*0d0321e0SJeremy L Thompson       for (int j = 0; j < BASIS_Q1D; ++j)
218*0d0321e0SJeremy L Thompson         for (int k = 0; k < BASIS_Q1D; ++k) {
219*0d0321e0SJeremy L Thompson           const int ind = e*BASIS_Q1D*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D +
220*0d0321e0SJeremy L Thompson                           k*BASIS_Q1D*BASIS_Q1D; // sequential
221*0d0321e0SJeremy L Thompson           w[ind] = w1d[i]*w1d[j]*w1d[k];
222*0d0321e0SJeremy L Thompson         }
223*0d0321e0SJeremy L Thompson }
224*0d0321e0SJeremy L Thompson 
225*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
226*0d0321e0SJeremy L Thompson // Quadrature weights
227*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
228*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem,
229*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ qweight1d,
230*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ v) {
231*0d0321e0SJeremy L Thompson   if (BASIS_DIM==1)
232*0d0321e0SJeremy L Thompson     weight1d(nelem, qweight1d, v);
233*0d0321e0SJeremy L Thompson   else if (BASIS_DIM==2)
234*0d0321e0SJeremy L Thompson     weight2d(nelem, qweight1d, v);
235*0d0321e0SJeremy L Thompson   else if (BASIS_DIM==3)
236*0d0321e0SJeremy L Thompson     weight3d(nelem, qweight1d, v);
237*0d0321e0SJeremy L Thompson }
238*0d0321e0SJeremy L Thompson 
239*0d0321e0SJeremy L Thompson );
240*0d0321e0SJeremy L Thompson 
241*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
242*0d0321e0SJeremy L Thompson // Non-Tensor Basis Kernels
243*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
244*0d0321e0SJeremy L Thompson static const char *kernelsNonTensorRef = QUOTE(
245*0d0321e0SJeremy L Thompson 
246*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
247*0d0321e0SJeremy L Thompson // Interp
248*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
249*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
250*0d0321e0SJeremy L Thompson                                   const CeedScalar *d_B,
251*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ d_U,
252*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ d_V) {
253*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
254*0d0321e0SJeremy L Thompson 
255*0d0321e0SJeremy L Thompson   const CeedScalar *U;
256*0d0321e0SJeremy L Thompson   CeedScalar V;
257*0d0321e0SJeremy L Thompson   //TODO load B in shared memory if blockDim.z > 1?
258*0d0321e0SJeremy L Thompson 
259*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
260*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
261*0d0321e0SJeremy L Thompson     for (int comp = 0; comp < BASIS_NCOMP; comp++) {
262*0d0321e0SJeremy L Thompson       if (!transpose) { // run with Q threads
263*0d0321e0SJeremy L Thompson         U = d_U + elem*P + comp*nelem*P;
264*0d0321e0SJeremy L Thompson         V = 0.0;
265*0d0321e0SJeremy L Thompson         for (int i = 0; i < P; ++i)
266*0d0321e0SJeremy L Thompson           V += d_B[i + tid*P]*U[i];
267*0d0321e0SJeremy L Thompson 
268*0d0321e0SJeremy L Thompson         d_V[elem*Q + comp*nelem*Q + tid] = V;
269*0d0321e0SJeremy L Thompson       } else { // run with P threads
270*0d0321e0SJeremy L Thompson         U = d_U + elem*Q + comp*nelem*Q;
271*0d0321e0SJeremy L Thompson         V = 0.0;
272*0d0321e0SJeremy L Thompson         for (int i = 0; i < Q; ++i)
273*0d0321e0SJeremy L Thompson           V += d_B[tid + i*P]*U[i];
274*0d0321e0SJeremy L Thompson 
275*0d0321e0SJeremy L Thompson         d_V[elem*P + comp*nelem*P + tid] = V;
276*0d0321e0SJeremy L Thompson       }
277*0d0321e0SJeremy L Thompson     }
278*0d0321e0SJeremy L Thompson   }
279*0d0321e0SJeremy L Thompson }
280*0d0321e0SJeremy L Thompson 
281*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
282*0d0321e0SJeremy L Thompson // Grad
283*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
284*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
285*0d0321e0SJeremy L Thompson                                 const CeedScalar *d_G,
286*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ d_U,
287*0d0321e0SJeremy L Thompson                                 CeedScalar *__restrict__ d_V) {
288*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
289*0d0321e0SJeremy L Thompson 
290*0d0321e0SJeremy L Thompson   const CeedScalar *U;
291*0d0321e0SJeremy L Thompson   //TODO load G in shared memory if blockDim.z > 1?
292*0d0321e0SJeremy L Thompson 
293*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
294*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
295*0d0321e0SJeremy L Thompson     for (int comp=0; comp<BASIS_NCOMP; comp++) {
296*0d0321e0SJeremy L Thompson       if (!transpose) { // run with Q threads
297*0d0321e0SJeremy L Thompson         CeedScalar V[BASIS_DIM];
298*0d0321e0SJeremy L Thompson         U = d_U + elem*P + comp*nelem*P;
299*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++)
300*0d0321e0SJeremy L Thompson           V[dim] = 0.0;
301*0d0321e0SJeremy L Thompson 
302*0d0321e0SJeremy L Thompson         for (int i = 0; i < P; ++i) {
303*0d0321e0SJeremy L Thompson           const CeedScalar val = U[i];
304*0d0321e0SJeremy L Thompson           for(int dim = 0; dim < BASIS_DIM; dim++)
305*0d0321e0SJeremy L Thompson             V[dim] += d_G[i + tid*P + dim*P*Q]*val;
306*0d0321e0SJeremy L Thompson         }
307*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++) {
308*0d0321e0SJeremy L Thompson           d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim];
309*0d0321e0SJeremy L Thompson         }
310*0d0321e0SJeremy L Thompson       } else { // run with P threads
311*0d0321e0SJeremy L Thompson         CeedScalar V = 0.0;
312*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++) {
313*0d0321e0SJeremy L Thompson           U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q;
314*0d0321e0SJeremy L Thompson           for (int i = 0; i < Q; ++i)
315*0d0321e0SJeremy L Thompson             V += d_G[tid + i*P + dim*P*Q]*U[i];
316*0d0321e0SJeremy L Thompson         }
317*0d0321e0SJeremy L Thompson         d_V[elem*P + comp*nelem*P + tid] = V;
318*0d0321e0SJeremy L Thompson       }
319*0d0321e0SJeremy L Thompson     }
320*0d0321e0SJeremy L Thompson   }
321*0d0321e0SJeremy L Thompson }
322*0d0321e0SJeremy L Thompson 
323*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
324*0d0321e0SJeremy L Thompson // Weight
325*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
326*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem,
327*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ qweight,
328*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ d_V) {
329*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
330*0d0321e0SJeremy L Thompson   //TODO load qweight in shared memory if blockDim.z > 1?
331*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
332*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
333*0d0321e0SJeremy L Thompson     d_V[elem*Q + tid] = qweight[tid];
334*0d0321e0SJeremy L Thompson   }
335*0d0321e0SJeremy L Thompson }
336*0d0321e0SJeremy L Thompson 
337*0d0321e0SJeremy L Thompson );
338*0d0321e0SJeremy L Thompson // *INDENT-ON*
339*0d0321e0SJeremy L Thompson 
340*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
341*0d0321e0SJeremy L Thompson // Basis apply - tensor
342*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
343*0d0321e0SJeremy L Thompson int CeedBasisApply_Hip(CeedBasis basis, const CeedInt nelem,
344*0d0321e0SJeremy L Thompson                        CeedTransposeMode tmode,
345*0d0321e0SJeremy L Thompson                        CeedEvalMode emode, CeedVector u, CeedVector v) {
346*0d0321e0SJeremy L Thompson   int ierr;
347*0d0321e0SJeremy L Thompson   Ceed ceed;
348*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
349*0d0321e0SJeremy L Thompson   Ceed_Hip *ceed_Hip;
350*0d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
351*0d0321e0SJeremy L Thompson   CeedBasis_Hip *data;
352*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
353*0d0321e0SJeremy L Thompson   const CeedInt transpose = tmode == CEED_TRANSPOSE;
354*0d0321e0SJeremy L Thompson   const int maxblocksize = 64;
355*0d0321e0SJeremy L Thompson 
356*0d0321e0SJeremy L Thompson   // Read vectors
357*0d0321e0SJeremy L Thompson   const CeedScalar *d_u;
358*0d0321e0SJeremy L Thompson   CeedScalar *d_v;
359*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
360*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
361*0d0321e0SJeremy L Thompson   }
362*0d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
363*0d0321e0SJeremy L Thompson 
364*0d0321e0SJeremy L Thompson   // Clear v for transpose operation
365*0d0321e0SJeremy L Thompson   if (tmode == CEED_TRANSPOSE) {
366*0d0321e0SJeremy L Thompson     CeedInt length;
367*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
368*0d0321e0SJeremy L Thompson     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
369*0d0321e0SJeremy L Thompson     CeedChk_Hip(ceed,ierr);
370*0d0321e0SJeremy L Thompson   }
371*0d0321e0SJeremy L Thompson 
372*0d0321e0SJeremy L Thompson   // Basis action
373*0d0321e0SJeremy L Thompson   switch (emode) {
374*0d0321e0SJeremy L Thompson   case CEED_EVAL_INTERP: {
375*0d0321e0SJeremy L Thompson     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
376*0d0321e0SJeremy L Thompson                           &data->d_interp1d, &d_u, &d_v
377*0d0321e0SJeremy L Thompson                          };
378*0d0321e0SJeremy L Thompson     CeedInt Q1d, dim;
379*0d0321e0SJeremy L Thompson     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
380*0d0321e0SJeremy L Thompson     ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
381*0d0321e0SJeremy L Thompson     CeedInt blocksize = CeedIntPow(Q1d, dim);
382*0d0321e0SJeremy L Thompson     blocksize = blocksize > maxblocksize ? maxblocksize : blocksize;
383*0d0321e0SJeremy L Thompson 
384*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelHip(ceed, data->interp, nelem, blocksize, interpargs);
385*0d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
386*0d0321e0SJeremy L Thompson   } break;
387*0d0321e0SJeremy L Thompson   case CEED_EVAL_GRAD: {
388*0d0321e0SJeremy L Thompson     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d,
389*0d0321e0SJeremy L Thompson                         &data->d_grad1d, &d_u, &d_v
390*0d0321e0SJeremy L Thompson                        };
391*0d0321e0SJeremy L Thompson     CeedInt blocksize = maxblocksize;
392*0d0321e0SJeremy L Thompson 
393*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelHip(ceed, data->grad, nelem, blocksize, gradargs);
394*0d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
395*0d0321e0SJeremy L Thompson   } break;
396*0d0321e0SJeremy L Thompson   case CEED_EVAL_WEIGHT: {
397*0d0321e0SJeremy L Thompson     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
398*0d0321e0SJeremy L Thompson     const int blocksize = 64;
399*0d0321e0SJeremy L Thompson     int gridsize = nelem/blocksize;
400*0d0321e0SJeremy L Thompson     if (blocksize * gridsize < nelem)
401*0d0321e0SJeremy L Thompson       gridsize += 1;
402*0d0321e0SJeremy L Thompson 
403*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelHip(ceed, data->weight, gridsize, blocksize,
404*0d0321e0SJeremy L Thompson                             weightargs); CeedChkBackend(ierr);
405*0d0321e0SJeremy L Thompson   } break;
406*0d0321e0SJeremy L Thompson   // LCOV_EXCL_START
407*0d0321e0SJeremy L Thompson   // Evaluate the divergence to/from the quadrature points
408*0d0321e0SJeremy L Thompson   case CEED_EVAL_DIV:
409*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
410*0d0321e0SJeremy L Thompson   // Evaluate the curl to/from the quadrature points
411*0d0321e0SJeremy L Thompson   case CEED_EVAL_CURL:
412*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
413*0d0321e0SJeremy L Thompson   // Take no action, BasisApply should not have been called
414*0d0321e0SJeremy L Thompson   case CEED_EVAL_NONE:
415*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
416*0d0321e0SJeremy L Thompson                      "CEED_EVAL_NONE does not make sense in this context");
417*0d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
418*0d0321e0SJeremy L Thompson   }
419*0d0321e0SJeremy L Thompson 
420*0d0321e0SJeremy L Thompson   // Restore vectors
421*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
422*0d0321e0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
423*0d0321e0SJeremy L Thompson   }
424*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
425*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
426*0d0321e0SJeremy L Thompson }
427*0d0321e0SJeremy L Thompson 
428*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
429*0d0321e0SJeremy L Thompson // Basis apply - non-tensor
430*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
431*0d0321e0SJeremy L Thompson int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt nelem,
432*0d0321e0SJeremy L Thompson                                 CeedTransposeMode tmode, CeedEvalMode emode,
433*0d0321e0SJeremy L Thompson                                 CeedVector u, CeedVector v) {
434*0d0321e0SJeremy L Thompson   int ierr;
435*0d0321e0SJeremy L Thompson   Ceed ceed;
436*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
437*0d0321e0SJeremy L Thompson   Ceed_Hip *ceed_Hip;
438*0d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
439*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Hip *data;
440*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
441*0d0321e0SJeremy L Thompson   CeedInt nnodes, nqpt;
442*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
443*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr);
444*0d0321e0SJeremy L Thompson   const CeedInt transpose = tmode == CEED_TRANSPOSE;
445*0d0321e0SJeremy L Thompson   int elemsPerBlock = 1;
446*0d0321e0SJeremy L Thompson   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
447*0d0321e0SJeremy L Thompson 
448*0d0321e0SJeremy L Thompson   // Read vectors
449*0d0321e0SJeremy L Thompson   const CeedScalar *d_u;
450*0d0321e0SJeremy L Thompson   CeedScalar *d_v;
451*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
452*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
453*0d0321e0SJeremy L Thompson   }
454*0d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
455*0d0321e0SJeremy L Thompson 
456*0d0321e0SJeremy L Thompson   // Clear v for transpose operation
457*0d0321e0SJeremy L Thompson   if (tmode == CEED_TRANSPOSE) {
458*0d0321e0SJeremy L Thompson     CeedInt length;
459*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
460*0d0321e0SJeremy L Thompson     ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar));
461*0d0321e0SJeremy L Thompson     CeedChk_Hip(ceed, ierr);
462*0d0321e0SJeremy L Thompson   }
463*0d0321e0SJeremy L Thompson 
464*0d0321e0SJeremy L Thompson   // Apply basis operation
465*0d0321e0SJeremy L Thompson   switch (emode) {
466*0d0321e0SJeremy L Thompson   case CEED_EVAL_INTERP: {
467*0d0321e0SJeremy L Thompson     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
468*0d0321e0SJeremy L Thompson                           &data->d_interp, &d_u, &d_v
469*0d0321e0SJeremy L Thompson                          };
470*0d0321e0SJeremy L Thompson     if (!transpose) {
471*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nqpt, 1,
472*0d0321e0SJeremy L Thompson                                  elemsPerBlock, interpargs); CeedChkBackend(ierr);
473*0d0321e0SJeremy L Thompson     } else {
474*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nnodes, 1,
475*0d0321e0SJeremy L Thompson                                  elemsPerBlock, interpargs); CeedChkBackend(ierr);
476*0d0321e0SJeremy L Thompson     }
477*0d0321e0SJeremy L Thompson   } break;
478*0d0321e0SJeremy L Thompson   case CEED_EVAL_GRAD: {
479*0d0321e0SJeremy L Thompson     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad,
480*0d0321e0SJeremy L Thompson                         &d_u, &d_v
481*0d0321e0SJeremy L Thompson                        };
482*0d0321e0SJeremy L Thompson     if (!transpose) {
483*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nqpt, 1,
484*0d0321e0SJeremy L Thompson                                  elemsPerBlock, gradargs); CeedChkBackend(ierr);
485*0d0321e0SJeremy L Thompson     } else {
486*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nnodes, 1,
487*0d0321e0SJeremy L Thompson                                  elemsPerBlock, gradargs); CeedChkBackend(ierr);
488*0d0321e0SJeremy L Thompson     }
489*0d0321e0SJeremy L Thompson   } break;
490*0d0321e0SJeremy L Thompson   case CEED_EVAL_WEIGHT: {
491*0d0321e0SJeremy L Thompson     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v};
492*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelDimHip(ceed, data->weight, grid, nqpt, 1,
493*0d0321e0SJeremy L Thompson                                elemsPerBlock, weightargs); CeedChkBackend(ierr);
494*0d0321e0SJeremy L Thompson   } break;
495*0d0321e0SJeremy L Thompson   // LCOV_EXCL_START
496*0d0321e0SJeremy L Thompson   // Evaluate the divergence to/from the quadrature points
497*0d0321e0SJeremy L Thompson   case CEED_EVAL_DIV:
498*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
499*0d0321e0SJeremy L Thompson   // Evaluate the curl to/from the quadrature points
500*0d0321e0SJeremy L Thompson   case CEED_EVAL_CURL:
501*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
502*0d0321e0SJeremy L Thompson   // Take no action, BasisApply should not have been called
503*0d0321e0SJeremy L Thompson   case CEED_EVAL_NONE:
504*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
505*0d0321e0SJeremy L Thompson                      "CEED_EVAL_NONE does not make sense in this context");
506*0d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
507*0d0321e0SJeremy L Thompson   }
508*0d0321e0SJeremy L Thompson 
509*0d0321e0SJeremy L Thompson   // Restore vectors
510*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
511*0d0321e0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
512*0d0321e0SJeremy L Thompson   }
513*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
514*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
515*0d0321e0SJeremy L Thompson }
516*0d0321e0SJeremy L Thompson 
517*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
518*0d0321e0SJeremy L Thompson // Destroy tensor basis
519*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
520*0d0321e0SJeremy L Thompson static int CeedBasisDestroy_Hip(CeedBasis basis) {
521*0d0321e0SJeremy L Thompson   int ierr;
522*0d0321e0SJeremy L Thompson   Ceed ceed;
523*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
524*0d0321e0SJeremy L Thompson 
525*0d0321e0SJeremy L Thompson   CeedBasis_Hip *data;
526*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
527*0d0321e0SJeremy L Thompson 
528*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, hipModuleUnload(data->module));
529*0d0321e0SJeremy L Thompson 
530*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_qweight1d); CeedChk_Hip(ceed,ierr);
531*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_interp1d); CeedChk_Hip(ceed,ierr);
532*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_grad1d); CeedChk_Hip(ceed,ierr);
533*0d0321e0SJeremy L Thompson 
534*0d0321e0SJeremy L Thompson   ierr = CeedFree(&data); CeedChkBackend(ierr);
535*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
536*0d0321e0SJeremy L Thompson }
537*0d0321e0SJeremy L Thompson 
538*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
539*0d0321e0SJeremy L Thompson // Destroy non-tensor basis
540*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
541*0d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
542*0d0321e0SJeremy L Thompson   int ierr;
543*0d0321e0SJeremy L Thompson   Ceed ceed;
544*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
545*0d0321e0SJeremy L Thompson 
546*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Hip *data;
547*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
548*0d0321e0SJeremy L Thompson 
549*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, hipModuleUnload(data->module));
550*0d0321e0SJeremy L Thompson 
551*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_qweight); CeedChk_Hip(ceed, ierr);
552*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr);
553*0d0321e0SJeremy L Thompson   ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr);
554*0d0321e0SJeremy L Thompson 
555*0d0321e0SJeremy L Thompson   ierr = CeedFree(&data); CeedChkBackend(ierr);
556*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
557*0d0321e0SJeremy L Thompson }
558*0d0321e0SJeremy L Thompson 
559*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
560*0d0321e0SJeremy L Thompson // Create tensor
561*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
562*0d0321e0SJeremy L Thompson int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, CeedInt Q1d,
563*0d0321e0SJeremy L Thompson                                 const CeedScalar *interp1d,
564*0d0321e0SJeremy L Thompson                                 const CeedScalar *grad1d,
565*0d0321e0SJeremy L Thompson                                 const CeedScalar *qref1d,
566*0d0321e0SJeremy L Thompson                                 const CeedScalar *qweight1d,
567*0d0321e0SJeremy L Thompson                                 CeedBasis basis) {
568*0d0321e0SJeremy L Thompson   int ierr;
569*0d0321e0SJeremy L Thompson   Ceed ceed;
570*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
571*0d0321e0SJeremy L Thompson   CeedBasis_Hip *data;
572*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
573*0d0321e0SJeremy L Thompson 
574*0d0321e0SJeremy L Thompson   // Copy data to GPU
575*0d0321e0SJeremy L Thompson   const CeedInt qBytes = Q1d * sizeof(CeedScalar);
576*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Hip(ceed,ierr);
577*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_qweight1d, qweight1d, qBytes,
578*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
579*0d0321e0SJeremy L Thompson 
580*0d0321e0SJeremy L Thompson   const CeedInt iBytes = qBytes * P1d;
581*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Hip(ceed,ierr);
582*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_interp1d, interp1d, iBytes,
583*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
584*0d0321e0SJeremy L Thompson 
585*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Hip(ceed,ierr);
586*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_grad1d, grad1d, iBytes,
587*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr);
588*0d0321e0SJeremy L Thompson 
589*0d0321e0SJeremy L Thompson   // Complie basis kernels
590*0d0321e0SJeremy L Thompson   CeedInt ncomp;
591*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
592*0d0321e0SJeremy L Thompson   ierr = CeedCompileHip(ceed, basiskernels, &data->module, 7,
593*0d0321e0SJeremy L Thompson                         "BASIS_Q1D", Q1d,
594*0d0321e0SJeremy L Thompson                         "BASIS_P1D", P1d,
595*0d0321e0SJeremy L Thompson                         "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
596*0d0321e0SJeremy L Thompson                             Q1d : P1d, dim),
597*0d0321e0SJeremy L Thompson                         "BASIS_DIM", dim,
598*0d0321e0SJeremy L Thompson                         "BASIS_NCOMP", ncomp,
599*0d0321e0SJeremy L Thompson                         "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
600*0d0321e0SJeremy L Thompson                         "BASIS_NQPT", CeedIntPow(Q1d, dim)
601*0d0321e0SJeremy L Thompson                        ); CeedChkBackend(ierr);
602*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp);
603*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
604*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad);
605*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
606*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight);
607*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
608*0d0321e0SJeremy L Thompson   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
609*0d0321e0SJeremy L Thompson 
610*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
611*0d0321e0SJeremy L Thompson                                 CeedBasisApply_Hip); CeedChkBackend(ierr);
612*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
613*0d0321e0SJeremy L Thompson                                 CeedBasisDestroy_Hip); CeedChkBackend(ierr);
614*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
615*0d0321e0SJeremy L Thompson }
616*0d0321e0SJeremy L Thompson 
617*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
618*0d0321e0SJeremy L Thompson // Create non-tensor
619*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
620*0d0321e0SJeremy L Thompson int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt nnodes,
621*0d0321e0SJeremy L Thompson                           CeedInt nqpts, const CeedScalar *interp,
622*0d0321e0SJeremy L Thompson                           const CeedScalar *grad, const CeedScalar *qref,
623*0d0321e0SJeremy L Thompson                           const CeedScalar *qweight, CeedBasis basis) {
624*0d0321e0SJeremy L Thompson   int ierr;
625*0d0321e0SJeremy L Thompson   Ceed ceed;
626*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
627*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Hip *data;
628*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
629*0d0321e0SJeremy L Thompson 
630*0d0321e0SJeremy L Thompson   // Copy basis data to GPU
631*0d0321e0SJeremy L Thompson   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
632*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_qweight, qBytes); CeedChk_Hip(ceed, ierr);
633*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_qweight, qweight, qBytes,
634*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
635*0d0321e0SJeremy L Thompson 
636*0d0321e0SJeremy L Thompson   const CeedInt iBytes = qBytes * nnodes;
637*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_interp, iBytes); CeedChk_Hip(ceed, ierr);
638*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_interp, interp, iBytes,
639*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
640*0d0321e0SJeremy L Thompson 
641*0d0321e0SJeremy L Thompson   const CeedInt gBytes = qBytes * nnodes * dim;
642*0d0321e0SJeremy L Thompson   ierr = hipMalloc((void **)&data->d_grad, gBytes); CeedChk_Hip(ceed, ierr);
643*0d0321e0SJeremy L Thompson   ierr = hipMemcpy(data->d_grad, grad, gBytes,
644*0d0321e0SJeremy L Thompson                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
645*0d0321e0SJeremy L Thompson 
646*0d0321e0SJeremy L Thompson   // Compile basis kernels
647*0d0321e0SJeremy L Thompson   CeedInt ncomp;
648*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
649*0d0321e0SJeremy L Thompson   ierr = CeedCompileHip(ceed, kernelsNonTensorRef, &data->module, 4,
650*0d0321e0SJeremy L Thompson                         "Q", nqpts,
651*0d0321e0SJeremy L Thompson                         "P", nnodes,
652*0d0321e0SJeremy L Thompson                         "BASIS_DIM", dim,
653*0d0321e0SJeremy L Thompson                         "BASIS_NCOMP", ncomp
654*0d0321e0SJeremy L Thompson                        ); CeedChk_Hip(ceed, ierr);
655*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp);
656*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
657*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad);
658*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
659*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight);
660*0d0321e0SJeremy L Thompson   CeedChk_Hip(ceed, ierr);
661*0d0321e0SJeremy L Thompson 
662*0d0321e0SJeremy L Thompson   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
663*0d0321e0SJeremy L Thompson 
664*0d0321e0SJeremy L Thompson   // Register backend functions
665*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
666*0d0321e0SJeremy L Thompson                                 CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr);
667*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
668*0d0321e0SJeremy L Thompson                                 CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr);
669*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
670*0d0321e0SJeremy L Thompson }
671*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
672