xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-basis.c (revision 0d0321e0e600f17fbb9528732fcb5c1d5c63fc0f)
1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details.
4*0d0321e0SJeremy L Thompson //
5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software
6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral
7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and
8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed.
9*0d0321e0SJeremy L Thompson //
10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office
12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for
13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including
14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early
15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative.
16*0d0321e0SJeremy L Thompson 
17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h>
18*0d0321e0SJeremy L Thompson #include <ceed/backend.h>
19*0d0321e0SJeremy L Thompson #include <cuda.h>
20*0d0321e0SJeremy L Thompson #include <cuda_runtime.h>
21*0d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h"
22*0d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
23*0d0321e0SJeremy L Thompson 
24*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
25*0d0321e0SJeremy L Thompson // Tensor Basis Kernels
26*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
27*0d0321e0SJeremy L Thompson // *INDENT-OFF*
28*0d0321e0SJeremy L Thompson static const char *basiskernels = QUOTE(
29*0d0321e0SJeremy L Thompson 
30*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
31*0d0321e0SJeremy L Thompson // Interp
32*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
33*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
34*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ interp1d,
35*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ u,
36*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ v) {
37*0d0321e0SJeremy L Thompson   const CeedInt i = threadIdx.x;
38*0d0321e0SJeremy L Thompson 
39*0d0321e0SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN];
40*0d0321e0SJeremy L Thompson   CeedScalar *s_interp1d = s_mem;
41*0d0321e0SJeremy L Thompson   CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D;
42*0d0321e0SJeremy L Thompson   CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
43*0d0321e0SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
44*0d0321e0SJeremy L Thompson     s_interp1d[k] = interp1d[k];
45*0d0321e0SJeremy L Thompson   }
46*0d0321e0SJeremy L Thompson 
47*0d0321e0SJeremy L Thompson   const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
48*0d0321e0SJeremy L Thompson   const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
49*0d0321e0SJeremy L Thompson   const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
50*0d0321e0SJeremy L Thompson   const CeedInt stride1 = transpose ? BASIS_P1D : 1;
51*0d0321e0SJeremy L Thompson   const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
52*0d0321e0SJeremy L Thompson   const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
53*0d0321e0SJeremy L Thompson   const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
54*0d0321e0SJeremy L Thompson   const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
55*0d0321e0SJeremy L Thompson   const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
56*0d0321e0SJeremy L Thompson 
57*0d0321e0SJeremy L Thompson   // Apply basis element by element
58*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
59*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {
60*0d0321e0SJeremy L Thompson       const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride;
61*0d0321e0SJeremy L Thompson       CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride;
62*0d0321e0SJeremy L Thompson       for (CeedInt k = i; k < u_size; k += blockDim.x) {
63*0d0321e0SJeremy L Thompson         s_buf1[k] = cur_u[k];
64*0d0321e0SJeremy L Thompson       }
65*0d0321e0SJeremy L Thompson       CeedInt pre = u_size;
66*0d0321e0SJeremy L Thompson       CeedInt post = 1;
67*0d0321e0SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
68*0d0321e0SJeremy L Thompson         __syncthreads();
69*0d0321e0SJeremy L Thompson         // Update buffers used
70*0d0321e0SJeremy L Thompson         pre /= P;
71*0d0321e0SJeremy L Thompson         const CeedScalar *in = d % 2 ? s_buf2 : s_buf1;
72*0d0321e0SJeremy L Thompson         CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2);
73*0d0321e0SJeremy L Thompson 
74*0d0321e0SJeremy L Thompson         // Contract along middle index
75*0d0321e0SJeremy L Thompson         const CeedInt writeLen = pre * post * Q;
76*0d0321e0SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
77*0d0321e0SJeremy L Thompson           const CeedInt c = k % post;
78*0d0321e0SJeremy L Thompson           const CeedInt j = (k / post) % Q;
79*0d0321e0SJeremy L Thompson           const CeedInt a = k / (post * Q);
80*0d0321e0SJeremy L Thompson 
81*0d0321e0SJeremy L Thompson           CeedScalar vk = 0;
82*0d0321e0SJeremy L Thompson           for (CeedInt b = 0; b < P; b++)
83*0d0321e0SJeremy L Thompson             vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c];
84*0d0321e0SJeremy L Thompson 
85*0d0321e0SJeremy L Thompson           out[k] = vk;
86*0d0321e0SJeremy L Thompson         }
87*0d0321e0SJeremy L Thompson 
88*0d0321e0SJeremy L Thompson         post *= Q;
89*0d0321e0SJeremy L Thompson       }
90*0d0321e0SJeremy L Thompson     }
91*0d0321e0SJeremy L Thompson   }
92*0d0321e0SJeremy L Thompson }
93*0d0321e0SJeremy L Thompson 
94*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
95*0d0321e0SJeremy L Thompson // Grad
96*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
97*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
98*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ interp1d,
99*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ grad1d,
100*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ u,
101*0d0321e0SJeremy L Thompson                                 CeedScalar *__restrict__ v) {
102*0d0321e0SJeremy L Thompson   const CeedInt i = threadIdx.x;
103*0d0321e0SJeremy L Thompson 
104*0d0321e0SJeremy L Thompson   __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)];
105*0d0321e0SJeremy L Thompson   CeedScalar *s_interp1d = s_mem;
106*0d0321e0SJeremy L Thompson   CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D;
107*0d0321e0SJeremy L Thompson   CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D;
108*0d0321e0SJeremy L Thompson   CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN;
109*0d0321e0SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) {
110*0d0321e0SJeremy L Thompson     s_interp1d[k] = interp1d[k];
111*0d0321e0SJeremy L Thompson     s_grad1d[k] = grad1d[k];
112*0d0321e0SJeremy L Thompson   }
113*0d0321e0SJeremy L Thompson 
114*0d0321e0SJeremy L Thompson   const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D;
115*0d0321e0SJeremy L Thompson   const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D;
116*0d0321e0SJeremy L Thompson   const CeedInt stride0 = transpose ? 1 : BASIS_P1D;
117*0d0321e0SJeremy L Thompson   const CeedInt stride1 = transpose ? BASIS_P1D : 1;
118*0d0321e0SJeremy L Thompson   const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
119*0d0321e0SJeremy L Thompson   const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT;
120*0d0321e0SJeremy L Thompson   const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE);
121*0d0321e0SJeremy L Thompson   const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT);
122*0d0321e0SJeremy L Thompson   const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0;
123*0d0321e0SJeremy L Thompson   const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP;
124*0d0321e0SJeremy L Thompson 
125*0d0321e0SJeremy L Thompson   // Apply basis element by element
126*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
127*0d0321e0SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) {
128*0d0321e0SJeremy L Thompson 
129*0d0321e0SJeremy L Thompson       // dim*dim contractions for grad
130*0d0321e0SJeremy L Thompson       for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) {
131*0d0321e0SJeremy L Thompson         CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE;
132*0d0321e0SJeremy L Thompson         CeedInt post = 1;
133*0d0321e0SJeremy L Thompson         const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride +
134*0d0321e0SJeremy L Thompson                                   comp * u_comp_stride;
135*0d0321e0SJeremy L Thompson         CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp *
136*0d0321e0SJeremy L Thompson                             v_comp_stride;
137*0d0321e0SJeremy L Thompson         for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) {
138*0d0321e0SJeremy L Thompson           __syncthreads();
139*0d0321e0SJeremy L Thompson           // Update buffers used
140*0d0321e0SJeremy L Thompson           pre /= P;
141*0d0321e0SJeremy L Thompson           const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d;
142*0d0321e0SJeremy L Thompson           const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1);
143*0d0321e0SJeremy L Thompson           CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2);
144*0d0321e0SJeremy L Thompson 
145*0d0321e0SJeremy L Thompson           // Contract along middle index
146*0d0321e0SJeremy L Thompson           const CeedInt writeLen = pre * post * Q;
147*0d0321e0SJeremy L Thompson           for (CeedInt k = i; k < writeLen; k += blockDim.x) {
148*0d0321e0SJeremy L Thompson             const CeedInt c = k % post;
149*0d0321e0SJeremy L Thompson             const CeedInt j = (k / post) % Q;
150*0d0321e0SJeremy L Thompson             const CeedInt a = k / (post * Q);
151*0d0321e0SJeremy L Thompson             CeedScalar vk = 0;
152*0d0321e0SJeremy L Thompson             for (CeedInt b = 0; b < P; b++)
153*0d0321e0SJeremy L Thompson               vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c];
154*0d0321e0SJeremy L Thompson 
155*0d0321e0SJeremy L Thompson             if (transpose && dim2 == BASIS_DIM - 1)
156*0d0321e0SJeremy L Thompson               out[k] += vk;
157*0d0321e0SJeremy L Thompson             else
158*0d0321e0SJeremy L Thompson               out[k] = vk;
159*0d0321e0SJeremy L Thompson           }
160*0d0321e0SJeremy L Thompson 
161*0d0321e0SJeremy L Thompson           post *= Q;
162*0d0321e0SJeremy L Thompson         }
163*0d0321e0SJeremy L Thompson       }
164*0d0321e0SJeremy L Thompson     }
165*0d0321e0SJeremy L Thompson   }
166*0d0321e0SJeremy L Thompson }
167*0d0321e0SJeremy L Thompson 
168*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
169*0d0321e0SJeremy L Thompson // 1D quadrature weights
170*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
171*0d0321e0SJeremy L Thompson __device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
172*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
173*0d0321e0SJeremy L Thompson   const int i = threadIdx.x;
174*0d0321e0SJeremy L Thompson   if (i < BASIS_Q1D) {
175*0d0321e0SJeremy L Thompson     const size_t elem = blockIdx.x;
176*0d0321e0SJeremy L Thompson     if (elem < nelem)
177*0d0321e0SJeremy L Thompson       w[elem*BASIS_Q1D + i] = qweight1d[i];
178*0d0321e0SJeremy L Thompson   }
179*0d0321e0SJeremy L Thompson }
180*0d0321e0SJeremy L Thompson 
181*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
182*0d0321e0SJeremy L Thompson // 2D quadrature weights
183*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
184*0d0321e0SJeremy L Thompson __device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
185*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
186*0d0321e0SJeremy L Thompson 
187*0d0321e0SJeremy L Thompson   const int i = threadIdx.x;
188*0d0321e0SJeremy L Thompson   const int j = threadIdx.y;
189*0d0321e0SJeremy L Thompson   if (i < BASIS_Q1D && j < BASIS_Q1D) {
190*0d0321e0SJeremy L Thompson     const size_t elem = blockIdx.x;
191*0d0321e0SJeremy L Thompson     if (elem < nelem) {
192*0d0321e0SJeremy L Thompson       const size_t ind = (elem * BASIS_Q1D + j) * BASIS_Q1D + i;
193*0d0321e0SJeremy L Thompson       w[ind] = qweight1d[i] * qweight1d[j];
194*0d0321e0SJeremy L Thompson     }
195*0d0321e0SJeremy L Thompson   }
196*0d0321e0SJeremy L Thompson }
197*0d0321e0SJeremy L Thompson 
198*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
199*0d0321e0SJeremy L Thompson // 3D quadrature weights
200*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
201*0d0321e0SJeremy L Thompson __device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
202*0d0321e0SJeremy L Thompson                          CeedScalar *w) {
203*0d0321e0SJeremy L Thompson   const int i = threadIdx.x;
204*0d0321e0SJeremy L Thompson   const int j = threadIdx.y;
205*0d0321e0SJeremy L Thompson   if (i < BASIS_Q1D && j < BASIS_Q1D) {
206*0d0321e0SJeremy L Thompson     const size_t elem = blockIdx.x;
207*0d0321e0SJeremy L Thompson     if (elem < nelem) {
208*0d0321e0SJeremy L Thompson       for (int k=0; k<BASIS_Q1D; k++) {
209*0d0321e0SJeremy L Thompson         const size_t ind = ((elem * BASIS_Q1D + k) * BASIS_Q1D + j) * BASIS_Q1D + i;
210*0d0321e0SJeremy L Thompson         w[ind] = qweight1d[i] * qweight1d[j] * qweight1d[k];
211*0d0321e0SJeremy L Thompson       }
212*0d0321e0SJeremy L Thompson     }
213*0d0321e0SJeremy L Thompson   }
214*0d0321e0SJeremy L Thompson }
215*0d0321e0SJeremy L Thompson 
216*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
217*0d0321e0SJeremy L Thompson // Quadrature weights
218*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
219*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem,
220*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ qweight1d,
221*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ v) {
222*0d0321e0SJeremy L Thompson   if (BASIS_DIM==1)
223*0d0321e0SJeremy L Thompson     weight1d(nelem, qweight1d, v);
224*0d0321e0SJeremy L Thompson   else if (BASIS_DIM==2)
225*0d0321e0SJeremy L Thompson     weight2d(nelem, qweight1d, v);
226*0d0321e0SJeremy L Thompson   else if (BASIS_DIM==3)
227*0d0321e0SJeremy L Thompson     weight3d(nelem, qweight1d, v);
228*0d0321e0SJeremy L Thompson }
229*0d0321e0SJeremy L Thompson 
230*0d0321e0SJeremy L Thompson );
231*0d0321e0SJeremy L Thompson 
232*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
233*0d0321e0SJeremy L Thompson // Non-Tensor Basis Kernels
234*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
235*0d0321e0SJeremy L Thompson static const char *kernelsNonTensorRef = QUOTE(
236*0d0321e0SJeremy L Thompson 
237*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
238*0d0321e0SJeremy L Thompson // Interp
239*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
240*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
241*0d0321e0SJeremy L Thompson                                   const CeedScalar *d_B,
242*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ d_U,
243*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ d_V) {
244*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
245*0d0321e0SJeremy L Thompson 
246*0d0321e0SJeremy L Thompson   const CeedScalar *U;
247*0d0321e0SJeremy L Thompson   CeedScalar V;
248*0d0321e0SJeremy L Thompson   //TODO load B in shared memory if blockDim.z > 1?
249*0d0321e0SJeremy L Thompson 
250*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
251*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
252*0d0321e0SJeremy L Thompson     for (int comp = 0; comp < BASIS_NCOMP; comp++) {
253*0d0321e0SJeremy L Thompson       if (!transpose) { // run with Q threads
254*0d0321e0SJeremy L Thompson         U = d_U + elem*P + comp*nelem*P;
255*0d0321e0SJeremy L Thompson         V = 0.0;
256*0d0321e0SJeremy L Thompson         for (int i = 0; i < P; ++i)
257*0d0321e0SJeremy L Thompson           V += d_B[i + tid*P]*U[i];
258*0d0321e0SJeremy L Thompson 
259*0d0321e0SJeremy L Thompson         d_V[elem*Q + comp*nelem*Q + tid] = V;
260*0d0321e0SJeremy L Thompson       } else { // run with P threads
261*0d0321e0SJeremy L Thompson         U = d_U + elem*Q + comp*nelem*Q;
262*0d0321e0SJeremy L Thompson         V = 0.0;
263*0d0321e0SJeremy L Thompson         for (int i = 0; i < Q; ++i)
264*0d0321e0SJeremy L Thompson           V += d_B[tid + i*P]*U[i];
265*0d0321e0SJeremy L Thompson 
266*0d0321e0SJeremy L Thompson         d_V[elem*P + comp*nelem*P + tid] = V;
267*0d0321e0SJeremy L Thompson       }
268*0d0321e0SJeremy L Thompson     }
269*0d0321e0SJeremy L Thompson   }
270*0d0321e0SJeremy L Thompson }
271*0d0321e0SJeremy L Thompson 
272*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
273*0d0321e0SJeremy L Thompson // Grad
274*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
275*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
276*0d0321e0SJeremy L Thompson                                 const CeedScalar *d_G,
277*0d0321e0SJeremy L Thompson                                 const CeedScalar *__restrict__ d_U,
278*0d0321e0SJeremy L Thompson                                 CeedScalar *__restrict__ d_V) {
279*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
280*0d0321e0SJeremy L Thompson 
281*0d0321e0SJeremy L Thompson   const CeedScalar *U;
282*0d0321e0SJeremy L Thompson   //TODO load G in shared memory if blockDim.z > 1?
283*0d0321e0SJeremy L Thompson 
284*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
285*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
286*0d0321e0SJeremy L Thompson     for (int comp=0; comp<BASIS_NCOMP; comp++) {
287*0d0321e0SJeremy L Thompson       if (!transpose) { // run with Q threads
288*0d0321e0SJeremy L Thompson         CeedScalar V[BASIS_DIM];
289*0d0321e0SJeremy L Thompson         U = d_U + elem*P + comp*nelem*P;
290*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++)
291*0d0321e0SJeremy L Thompson           V[dim] = 0.0;
292*0d0321e0SJeremy L Thompson 
293*0d0321e0SJeremy L Thompson         for (int i = 0; i < P; ++i) {
294*0d0321e0SJeremy L Thompson           const CeedScalar val = U[i];
295*0d0321e0SJeremy L Thompson           for(int dim = 0; dim < BASIS_DIM; dim++)
296*0d0321e0SJeremy L Thompson             V[dim] += d_G[i + tid*P + dim*P*Q]*val;
297*0d0321e0SJeremy L Thompson         }
298*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++) {
299*0d0321e0SJeremy L Thompson           d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim];
300*0d0321e0SJeremy L Thompson         }
301*0d0321e0SJeremy L Thompson       } else { // run with P threads
302*0d0321e0SJeremy L Thompson         CeedScalar V = 0.0;
303*0d0321e0SJeremy L Thompson         for (int dim = 0; dim < BASIS_DIM; dim++) {
304*0d0321e0SJeremy L Thompson           U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q;
305*0d0321e0SJeremy L Thompson           for (int i = 0; i < Q; ++i)
306*0d0321e0SJeremy L Thompson             V += d_G[tid + i*P + dim*P*Q]*U[i];
307*0d0321e0SJeremy L Thompson         }
308*0d0321e0SJeremy L Thompson         d_V[elem*P + comp*nelem*P + tid] = V;
309*0d0321e0SJeremy L Thompson       }
310*0d0321e0SJeremy L Thompson     }
311*0d0321e0SJeremy L Thompson   }
312*0d0321e0SJeremy L Thompson }
313*0d0321e0SJeremy L Thompson 
314*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
315*0d0321e0SJeremy L Thompson // Weight
316*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
317*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem,
318*0d0321e0SJeremy L Thompson                                   const CeedScalar *__restrict__ qweight,
319*0d0321e0SJeremy L Thompson                                   CeedScalar *__restrict__ d_V) {
320*0d0321e0SJeremy L Thompson   const int tid = threadIdx.x;
321*0d0321e0SJeremy L Thompson   //TODO load qweight in shared memory if blockDim.z > 1?
322*0d0321e0SJeremy L Thompson   for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
323*0d0321e0SJeremy L Thompson        elem += gridDim.x*blockDim.z) {
324*0d0321e0SJeremy L Thompson     d_V[elem*Q + tid] = qweight[tid];
325*0d0321e0SJeremy L Thompson   }
326*0d0321e0SJeremy L Thompson }
327*0d0321e0SJeremy L Thompson 
328*0d0321e0SJeremy L Thompson );
329*0d0321e0SJeremy L Thompson // *INDENT-ON*
330*0d0321e0SJeremy L Thompson 
331*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
332*0d0321e0SJeremy L Thompson // Basis apply - tensor
333*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
334*0d0321e0SJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt nelem,
335*0d0321e0SJeremy L Thompson                         CeedTransposeMode tmode,
336*0d0321e0SJeremy L Thompson                         CeedEvalMode emode, CeedVector u, CeedVector v) {
337*0d0321e0SJeremy L Thompson   int ierr;
338*0d0321e0SJeremy L Thompson   Ceed ceed;
339*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
340*0d0321e0SJeremy L Thompson   Ceed_Cuda *ceed_Cuda;
341*0d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
342*0d0321e0SJeremy L Thompson   CeedBasis_Cuda *data;
343*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
344*0d0321e0SJeremy L Thompson   const CeedInt transpose = tmode == CEED_TRANSPOSE;
345*0d0321e0SJeremy L Thompson   const int maxblocksize = 32;
346*0d0321e0SJeremy L Thompson 
347*0d0321e0SJeremy L Thompson   // Read vectors
348*0d0321e0SJeremy L Thompson   const CeedScalar *d_u;
349*0d0321e0SJeremy L Thompson   CeedScalar *d_v;
350*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
351*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
352*0d0321e0SJeremy L Thompson   }
353*0d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
354*0d0321e0SJeremy L Thompson 
355*0d0321e0SJeremy L Thompson   // Clear v for transpose operation
356*0d0321e0SJeremy L Thompson   if (tmode == CEED_TRANSPOSE) {
357*0d0321e0SJeremy L Thompson     CeedInt length;
358*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
359*0d0321e0SJeremy L Thompson     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
360*0d0321e0SJeremy L Thompson     CeedChk_Cu(ceed,ierr);
361*0d0321e0SJeremy L Thompson   }
362*0d0321e0SJeremy L Thompson   CeedInt Q1d, dim;
363*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
364*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
365*0d0321e0SJeremy L Thompson 
366*0d0321e0SJeremy L Thompson   // Basis action
367*0d0321e0SJeremy L Thompson   switch (emode) {
368*0d0321e0SJeremy L Thompson   case CEED_EVAL_INTERP: {
369*0d0321e0SJeremy L Thompson     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
370*0d0321e0SJeremy L Thompson                           &data->d_interp1d, &d_u, &d_v
371*0d0321e0SJeremy L Thompson                          };
372*0d0321e0SJeremy L Thompson     CeedInt blocksize = CeedIntPow(Q1d, dim);
373*0d0321e0SJeremy L Thompson     blocksize = blocksize > maxblocksize ? maxblocksize : blocksize;
374*0d0321e0SJeremy L Thompson 
375*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelCuda(ceed, data->interp, nelem, blocksize, interpargs);
376*0d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
377*0d0321e0SJeremy L Thompson   } break;
378*0d0321e0SJeremy L Thompson   case CEED_EVAL_GRAD: {
379*0d0321e0SJeremy L Thompson     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d,
380*0d0321e0SJeremy L Thompson                         &data->d_grad1d, &d_u, &d_v
381*0d0321e0SJeremy L Thompson                        };
382*0d0321e0SJeremy L Thompson     CeedInt blocksize = maxblocksize;
383*0d0321e0SJeremy L Thompson 
384*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelCuda(ceed, data->grad, nelem, blocksize, gradargs);
385*0d0321e0SJeremy L Thompson     CeedChkBackend(ierr);
386*0d0321e0SJeremy L Thompson   } break;
387*0d0321e0SJeremy L Thompson   case CEED_EVAL_WEIGHT: {
388*0d0321e0SJeremy L Thompson     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
389*0d0321e0SJeremy L Thompson     const int gridsize = nelem;
390*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize,
391*0d0321e0SJeremy L Thompson                                 Q1d, dim >= 2 ? Q1d : 1, 1,
392*0d0321e0SJeremy L Thompson                                 weightargs); CeedChkBackend(ierr);
393*0d0321e0SJeremy L Thompson   } break;
394*0d0321e0SJeremy L Thompson   // LCOV_EXCL_START
395*0d0321e0SJeremy L Thompson   // Evaluate the divergence to/from the quadrature points
396*0d0321e0SJeremy L Thompson   case CEED_EVAL_DIV:
397*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
398*0d0321e0SJeremy L Thompson   // Evaluate the curl to/from the quadrature points
399*0d0321e0SJeremy L Thompson   case CEED_EVAL_CURL:
400*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
401*0d0321e0SJeremy L Thompson   // Take no action, BasisApply should not have been called
402*0d0321e0SJeremy L Thompson   case CEED_EVAL_NONE:
403*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
404*0d0321e0SJeremy L Thompson                      "CEED_EVAL_NONE does not make sense in this context");
405*0d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
406*0d0321e0SJeremy L Thompson   }
407*0d0321e0SJeremy L Thompson 
408*0d0321e0SJeremy L Thompson   // Restore vectors
409*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
410*0d0321e0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
411*0d0321e0SJeremy L Thompson   }
412*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
413*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
414*0d0321e0SJeremy L Thompson }
415*0d0321e0SJeremy L Thompson 
416*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
417*0d0321e0SJeremy L Thompson // Basis apply - non-tensor
418*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
419*0d0321e0SJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt nelem,
420*0d0321e0SJeremy L Thompson                                  CeedTransposeMode tmode, CeedEvalMode emode,
421*0d0321e0SJeremy L Thompson                                  CeedVector u, CeedVector v) {
422*0d0321e0SJeremy L Thompson   int ierr;
423*0d0321e0SJeremy L Thompson   Ceed ceed;
424*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
425*0d0321e0SJeremy L Thompson   Ceed_Cuda *ceed_Cuda;
426*0d0321e0SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr);
427*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
428*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
429*0d0321e0SJeremy L Thompson   CeedInt nnodes, nqpt;
430*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr);
431*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr);
432*0d0321e0SJeremy L Thompson   const CeedInt transpose = tmode == CEED_TRANSPOSE;
433*0d0321e0SJeremy L Thompson   int elemsPerBlock = 1;
434*0d0321e0SJeremy L Thompson   int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0);
435*0d0321e0SJeremy L Thompson 
436*0d0321e0SJeremy L Thompson   // Read vectors
437*0d0321e0SJeremy L Thompson   const CeedScalar *d_u;
438*0d0321e0SJeremy L Thompson   CeedScalar *d_v;
439*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
440*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
441*0d0321e0SJeremy L Thompson   }
442*0d0321e0SJeremy L Thompson   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
443*0d0321e0SJeremy L Thompson 
444*0d0321e0SJeremy L Thompson   // Clear v for transpose operation
445*0d0321e0SJeremy L Thompson   if (tmode == CEED_TRANSPOSE) {
446*0d0321e0SJeremy L Thompson     CeedInt length;
447*0d0321e0SJeremy L Thompson     ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr);
448*0d0321e0SJeremy L Thompson     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar));
449*0d0321e0SJeremy L Thompson     CeedChk_Cu(ceed, ierr);
450*0d0321e0SJeremy L Thompson   }
451*0d0321e0SJeremy L Thompson 
452*0d0321e0SJeremy L Thompson   // Apply basis operation
453*0d0321e0SJeremy L Thompson   switch (emode) {
454*0d0321e0SJeremy L Thompson   case CEED_EVAL_INTERP: {
455*0d0321e0SJeremy L Thompson     void *interpargs[] = {(void *) &nelem, (void *) &transpose,
456*0d0321e0SJeremy L Thompson                           &data->d_interp, &d_u, &d_v
457*0d0321e0SJeremy L Thompson                          };
458*0d0321e0SJeremy L Thompson     if (!transpose) {
459*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->interp, grid, nqpt, 1,
460*0d0321e0SJeremy L Thompson                                   elemsPerBlock, interpargs); CeedChkBackend(ierr);
461*0d0321e0SJeremy L Thompson     } else {
462*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->interp, grid, nnodes, 1,
463*0d0321e0SJeremy L Thompson                                   elemsPerBlock, interpargs); CeedChkBackend(ierr);
464*0d0321e0SJeremy L Thompson     }
465*0d0321e0SJeremy L Thompson   } break;
466*0d0321e0SJeremy L Thompson   case CEED_EVAL_GRAD: {
467*0d0321e0SJeremy L Thompson     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad,
468*0d0321e0SJeremy L Thompson                         &d_u, &d_v
469*0d0321e0SJeremy L Thompson                        };
470*0d0321e0SJeremy L Thompson     if (!transpose) {
471*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->grad, grid, nqpt, 1,
472*0d0321e0SJeremy L Thompson                                   elemsPerBlock, gradargs); CeedChkBackend(ierr);
473*0d0321e0SJeremy L Thompson     } else {
474*0d0321e0SJeremy L Thompson       ierr = CeedRunKernelDimCuda(ceed, data->grad, grid, nnodes, 1,
475*0d0321e0SJeremy L Thompson                                   elemsPerBlock, gradargs); CeedChkBackend(ierr);
476*0d0321e0SJeremy L Thompson     }
477*0d0321e0SJeremy L Thompson   } break;
478*0d0321e0SJeremy L Thompson   case CEED_EVAL_WEIGHT: {
479*0d0321e0SJeremy L Thompson     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v};
480*0d0321e0SJeremy L Thompson     ierr = CeedRunKernelDimCuda(ceed, data->weight, grid, nqpt, 1,
481*0d0321e0SJeremy L Thompson                                 elemsPerBlock, weightargs); CeedChkBackend(ierr);
482*0d0321e0SJeremy L Thompson   } break;
483*0d0321e0SJeremy L Thompson   // LCOV_EXCL_START
484*0d0321e0SJeremy L Thompson   // Evaluate the divergence to/from the quadrature points
485*0d0321e0SJeremy L Thompson   case CEED_EVAL_DIV:
486*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
487*0d0321e0SJeremy L Thompson   // Evaluate the curl to/from the quadrature points
488*0d0321e0SJeremy L Thompson   case CEED_EVAL_CURL:
489*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
490*0d0321e0SJeremy L Thompson   // Take no action, BasisApply should not have been called
491*0d0321e0SJeremy L Thompson   case CEED_EVAL_NONE:
492*0d0321e0SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
493*0d0321e0SJeremy L Thompson                      "CEED_EVAL_NONE does not make sense in this context");
494*0d0321e0SJeremy L Thompson     // LCOV_EXCL_STOP
495*0d0321e0SJeremy L Thompson   }
496*0d0321e0SJeremy L Thompson 
497*0d0321e0SJeremy L Thompson   // Restore vectors
498*0d0321e0SJeremy L Thompson   if (emode != CEED_EVAL_WEIGHT) {
499*0d0321e0SJeremy L Thompson     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
500*0d0321e0SJeremy L Thompson   }
501*0d0321e0SJeremy L Thompson   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
502*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
503*0d0321e0SJeremy L Thompson }
504*0d0321e0SJeremy L Thompson 
505*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
506*0d0321e0SJeremy L Thompson // Destroy tensor basis
507*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
508*0d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) {
509*0d0321e0SJeremy L Thompson   int ierr;
510*0d0321e0SJeremy L Thompson   Ceed ceed;
511*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
512*0d0321e0SJeremy L Thompson 
513*0d0321e0SJeremy L Thompson   CeedBasis_Cuda *data;
514*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
515*0d0321e0SJeremy L Thompson 
516*0d0321e0SJeremy L Thompson   CeedChk_Cu(ceed, cuModuleUnload(data->module));
517*0d0321e0SJeremy L Thompson 
518*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_qweight1d); CeedChk_Cu(ceed,ierr);
519*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_interp1d); CeedChk_Cu(ceed,ierr);
520*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_grad1d); CeedChk_Cu(ceed,ierr);
521*0d0321e0SJeremy L Thompson 
522*0d0321e0SJeremy L Thompson   ierr = CeedFree(&data); CeedChkBackend(ierr);
523*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
524*0d0321e0SJeremy L Thompson }
525*0d0321e0SJeremy L Thompson 
526*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
527*0d0321e0SJeremy L Thompson // Destroy non-tensor basis
528*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
529*0d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) {
530*0d0321e0SJeremy L Thompson   int ierr;
531*0d0321e0SJeremy L Thompson   Ceed ceed;
532*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
533*0d0321e0SJeremy L Thompson 
534*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
535*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
536*0d0321e0SJeremy L Thompson 
537*0d0321e0SJeremy L Thompson   CeedChk_Cu(ceed, cuModuleUnload(data->module));
538*0d0321e0SJeremy L Thompson 
539*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_qweight); CeedChk_Cu(ceed, ierr);
540*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr);
541*0d0321e0SJeremy L Thompson   ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr);
542*0d0321e0SJeremy L Thompson 
543*0d0321e0SJeremy L Thompson   ierr = CeedFree(&data); CeedChkBackend(ierr);
544*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
545*0d0321e0SJeremy L Thompson }
546*0d0321e0SJeremy L Thompson 
547*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
548*0d0321e0SJeremy L Thompson // Create tensor
549*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
550*0d0321e0SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P1d, CeedInt Q1d,
551*0d0321e0SJeremy L Thompson                                  const CeedScalar *interp1d,
552*0d0321e0SJeremy L Thompson                                  const CeedScalar *grad1d,
553*0d0321e0SJeremy L Thompson                                  const CeedScalar *qref1d,
554*0d0321e0SJeremy L Thompson                                  const CeedScalar *qweight1d,
555*0d0321e0SJeremy L Thompson                                  CeedBasis basis) {
556*0d0321e0SJeremy L Thompson   int ierr;
557*0d0321e0SJeremy L Thompson   Ceed ceed;
558*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
559*0d0321e0SJeremy L Thompson   CeedBasis_Cuda *data;
560*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
561*0d0321e0SJeremy L Thompson 
562*0d0321e0SJeremy L Thompson   // Copy data to GPU
563*0d0321e0SJeremy L Thompson   const CeedInt qBytes = Q1d * sizeof(CeedScalar);
564*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Cu(ceed,ierr);
565*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_qweight1d, qweight1d, qBytes,
566*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr);
567*0d0321e0SJeremy L Thompson 
568*0d0321e0SJeremy L Thompson   const CeedInt iBytes = qBytes * P1d;
569*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Cu(ceed,ierr);
570*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_interp1d, interp1d, iBytes,
571*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr);
572*0d0321e0SJeremy L Thompson 
573*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Cu(ceed,ierr);
574*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_grad1d, grad1d, iBytes,
575*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr);
576*0d0321e0SJeremy L Thompson 
577*0d0321e0SJeremy L Thompson   // Complie basis kernels
578*0d0321e0SJeremy L Thompson   CeedInt ncomp;
579*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
580*0d0321e0SJeremy L Thompson   ierr = CeedCompileCuda(ceed, basiskernels, &data->module, 7,
581*0d0321e0SJeremy L Thompson                          "BASIS_Q1D", Q1d,
582*0d0321e0SJeremy L Thompson                          "BASIS_P1D", P1d,
583*0d0321e0SJeremy L Thompson                          "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
584*0d0321e0SJeremy L Thompson                              Q1d : P1d, dim),
585*0d0321e0SJeremy L Thompson                          "BASIS_DIM", dim,
586*0d0321e0SJeremy L Thompson                          "BASIS_NCOMP", ncomp,
587*0d0321e0SJeremy L Thompson                          "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
588*0d0321e0SJeremy L Thompson                          "BASIS_NQPT", CeedIntPow(Q1d, dim)
589*0d0321e0SJeremy L Thompson                         ); CeedChkBackend(ierr);
590*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp);
591*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
592*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad);
593*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
594*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight);
595*0d0321e0SJeremy L Thompson   CeedChkBackend(ierr);
596*0d0321e0SJeremy L Thompson   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
597*0d0321e0SJeremy L Thompson 
598*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
599*0d0321e0SJeremy L Thompson                                 CeedBasisApply_Cuda); CeedChkBackend(ierr);
600*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
601*0d0321e0SJeremy L Thompson                                 CeedBasisDestroy_Cuda); CeedChkBackend(ierr);
602*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
603*0d0321e0SJeremy L Thompson }
604*0d0321e0SJeremy L Thompson 
605*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
606*0d0321e0SJeremy L Thompson // Create non-tensor
607*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
608*0d0321e0SJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt nnodes,
609*0d0321e0SJeremy L Thompson                            CeedInt nqpts, const CeedScalar *interp,
610*0d0321e0SJeremy L Thompson                            const CeedScalar *grad, const CeedScalar *qref,
611*0d0321e0SJeremy L Thompson                            const CeedScalar *qweight, CeedBasis basis) {
612*0d0321e0SJeremy L Thompson   int ierr;
613*0d0321e0SJeremy L Thompson   Ceed ceed;
614*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
615*0d0321e0SJeremy L Thompson   CeedBasisNonTensor_Cuda *data;
616*0d0321e0SJeremy L Thompson   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
617*0d0321e0SJeremy L Thompson 
618*0d0321e0SJeremy L Thompson   // Copy basis data to GPU
619*0d0321e0SJeremy L Thompson   const CeedInt qBytes = nqpts * sizeof(CeedScalar);
620*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_qweight, qBytes); CeedChk_Cu(ceed, ierr);
621*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_qweight, qweight, qBytes,
622*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
623*0d0321e0SJeremy L Thompson 
624*0d0321e0SJeremy L Thompson   const CeedInt iBytes = qBytes * nnodes;
625*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_interp, iBytes); CeedChk_Cu(ceed, ierr);
626*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_interp, interp, iBytes,
627*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
628*0d0321e0SJeremy L Thompson 
629*0d0321e0SJeremy L Thompson   const CeedInt gBytes = qBytes * nnodes * dim;
630*0d0321e0SJeremy L Thompson   ierr = cudaMalloc((void **)&data->d_grad, gBytes); CeedChk_Cu(ceed, ierr);
631*0d0321e0SJeremy L Thompson   ierr = cudaMemcpy(data->d_grad, grad, gBytes,
632*0d0321e0SJeremy L Thompson                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
633*0d0321e0SJeremy L Thompson 
634*0d0321e0SJeremy L Thompson   // Compile basis kernels
635*0d0321e0SJeremy L Thompson   CeedInt ncomp;
636*0d0321e0SJeremy L Thompson   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr);
637*0d0321e0SJeremy L Thompson   ierr = CeedCompileCuda(ceed, kernelsNonTensorRef, &data->module, 4,
638*0d0321e0SJeremy L Thompson                          "Q", nqpts,
639*0d0321e0SJeremy L Thompson                          "P", nnodes,
640*0d0321e0SJeremy L Thompson                          "BASIS_DIM", dim,
641*0d0321e0SJeremy L Thompson                          "BASIS_NCOMP", ncomp
642*0d0321e0SJeremy L Thompson                         ); CeedChk_Cu(ceed, ierr);
643*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp);
644*0d0321e0SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
645*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad);
646*0d0321e0SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
647*0d0321e0SJeremy L Thompson   ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight);
648*0d0321e0SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
649*0d0321e0SJeremy L Thompson 
650*0d0321e0SJeremy L Thompson   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
651*0d0321e0SJeremy L Thompson 
652*0d0321e0SJeremy L Thompson   // Register backend functions
653*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
654*0d0321e0SJeremy L Thompson                                 CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr);
655*0d0321e0SJeremy L Thompson   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
656*0d0321e0SJeremy L Thompson                                 CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr);
657*0d0321e0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
658*0d0321e0SJeremy L Thompson }
659*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
660