xref: /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c (revision c042f62f62e10e1321eb699b116e67a6568d5716)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed-backend.h>
18 #include <ceed.h>
19 #include "ceed-cuda-shared.h"
20 #include "../cuda/ceed-cuda.h"
21 
22 //*********************
23 // shared mem kernels
24 static const char *kernelsShared = QUOTE(
25 
// Accumulate the Q1D register entries of r_U into r_V, component-wise.
inline __device__ void add(CeedScalar *r_V, const CeedScalar *r_U) {
  int k = 0;
  while (k < Q1D) {
    r_V[k] = r_V[k] + r_U[k];
    ++k;
  }
}
30 
31 //////////
32 //  1D  //
33 //////////
34 
// Load the P1D nodal values of one (element, component) pair from global
// memory into this thread's tidz row of the shared slice, zero-padding
// entries [P1D, Q1D) so Q1D-wide contractions read valid data.
// d_U layout (from the indexing below): node index fastest, then component,
// then element. tidx/tidy are unused; every thread redundantly writes the
// same row, which is benign since all write identical values.
inline __device__ void readDofs1d(const int elem, const int tidx,
                                  const int tidy, const int tidz,const int comp,
                                  const int nelem, const CeedScalar *d_U, CeedScalar *slice) {
  for (int i = 0; i < P1D; i++)
    slice[i+tidz*Q1D] = d_U[i + comp*P1D + elem*BASIS_NCOMP*P1D];
  for (int i = P1D; i < Q1D; i++)
    slice[i+tidz*Q1D] = 0.0;  // pad so reads up to Q1D are defined
}
43 
// Store one nodal value r_V back to d_V for the given (element, component).
// Threads are launched Q1D wide in x, but only the first P1D hold node data,
// hence the tidx < P1D guard. tidy and nelem are unused (uniform signature).
inline __device__ void writeDofs1d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx<P1D) {
    d_V[tidx + comp*P1D + elem*BASIS_NCOMP*P1D] = r_V;
  }
}
52 
// Load the Q1D quadrature-point values of one (element, component, dim)
// triple into this thread's tidz row of the shared slice.
// d_U layout (from the indexing below): quadrature point fastest, then
// element, then component, then derivative dimension. tidx/tidy unused;
// all threads redundantly write identical values.
inline __device__ void readQuads1d(const int elem, const int tidx,
                                   const int tidy, const int tidz, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar *slice) {
  for (int i = 0; i < Q1D; i++)
    slice[i+tidz*Q1D] = d_U[i + elem*Q1D + comp*Q1D*nelem +
                            dim*BASIS_NCOMP*nelem*Q1D];
}
61 
// Store one quadrature-point value r_V to d_V at this thread's tidx point,
// for the given (element, component, dim). No guard needed: the launch is
// exactly Q1D threads wide in x. tidy is unused (uniform signature).
inline __device__ void writeQuads1d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  d_V[tidx + elem*Q1D + comp*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D] = r_V;
}
68 
// 1D tensor contraction: V = sum_i B[i + tidx*P1D] * slice[i], producing the
// value at this thread's quadrature point tidx from the P1D nodal values
// already staged in the tidz row of the shared slice.
// NOTE(review): parameter U is unused here (data comes from `slice`);
// it is kept so all Contract* helpers share one call shape.
inline __device__ void ContractX1d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidx*P1D] * slice[i+tidz*Q1D];//contract x direction
  }
}
78 
// Transposed 1D contraction: V = sum_i B[tidx + i*P1D] * slice[i], i.e. the
// same matrix B accessed transposed, mapping Q1D quadrature values staged in
// the tidz slice row back to the node at tidx.
// NOTE(review): parameter U is unused (data comes from `slice`); kept for a
// uniform helper signature. No tidx < P1D guard here — writeDofs1d guards.
inline __device__ void ContractTransposeX1d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  V = 0.0;
  for (int i = 0; i < Q1D; ++i) {
    V += B[tidx + i*P1D] * slice[i+tidz*Q1D];//contract x direction
  }
}
87 
// 1D interpolation: apply B (or its transpose when `transpose` is set) to
// every component of a range of elements. Each z-slab of the thread block
// handles one element; the outer loop is grid-stride over elements with
// stride gridDim.x*blockDim.z. `slice` is the dynamic shared staging buffer
// (one Q1D row per z-slab).
// NOTE(review): r_t is passed uninitialized to the contractions, which is
// harmless because their U parameter is unused — confirm if that changes.
inline __device__ void interp1d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;


  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for(int comp=0; comp<BASIS_NCOMP; comp++) {
      if(!transpose) {
        // Forward: nodes -> quadrature points.
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
        writeQuads1d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
      } else {
        // Transpose: quadrature points -> nodes.
        readQuads1d(elem, tidx, tidy, tidz, comp, 0, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_V, d_V);
      }
    }
  }
}
116 
// 1D gradient: apply the derivative matrix G (or its transpose) to every
// component of a range of elements. Same thread/element mapping as interp1d:
// one element per z-slab, grid-stride over elements. c_B is accepted for a
// signature parallel to grad2d/grad3d but is not used in 1D.
// NOTE(review): r_U is passed uninitialized to the contractions, which is
// harmless because their U parameter is unused — confirm if that changes.
inline __device__ void grad1d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  CeedScalar r_U;
  CeedScalar r_V;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  int dim;

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    for(int comp=0; comp<BASIS_NCOMP; comp++) {
      if(!transpose) {
        // Forward: nodes -> derivative values at quadrature points.
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
        dim = 0;
        writeQuads1d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      } else {
        // Transpose: derivative values -> nodal contributions.
        dim = 0;
        readQuads1d(elem, tidx, tidy, tidz, comp, dim, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_V, d_V);
      }
    }
  }
}
147 //////////
148 //  2D  //
149 //////////
150 
// Load the nodal value at (tidx, tidy) of one (element, component) pair into
// the register U; threads beyond the P1D x P1D node grid get 0 so later
// Q1D-wide contractions stay well-defined.
// d_U layout: x-node fastest, then y-node, then component, then element.
inline __device__ void readDofs2d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U,
                                  CeedScalar &U) {
  U = (tidx<P1D
       && tidy<P1D) ? d_U[tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D ] :
      0.0;
}
159 
// Store one nodal value r_V at (tidx, tidy) for the given (element,
// component); guarded so only threads inside the P1D x P1D node grid write.
// nelem is unused (uniform signature with the quad writers).
inline __device__ void writeDofs2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx<P1D && tidy<P1D) {
    d_V[tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D ] = r_V;
  }
}
168 
// Load the quadrature value at (tidx, tidy) of one (element, component, dim)
// triple into register U. d_U layout: x-point fastest, then y-point, then
// element, then component, then derivative dimension.
inline __device__ void readQuads2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar &U ) {
  U = d_U[tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem +
               dim*BASIS_NCOMP*nelem*Q1D*Q1D];
}
176 
// Store one quadrature value r_V at (tidx, tidy) for the given (element,
// component, dim). No guard: the launch is exactly Q1D x Q1D threads wide.
inline __device__ void writeQuads2d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  d_V[tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem +
           dim*BASIS_NCOMP*nelem*Q1D*Q1D ] = r_V;
}
184 
// Contract the x direction of a 2D slab: stage each thread's value U in the
// shared slice (one Q1D x Q1D tile per z-slab), then compute
// V = sum_i B[i + tidx*P1D] * slice[i][tidy].
// Contains __syncthreads(): must be reached by every thread of the block,
// so callers keep these helpers outside divergent branches.
inline __device__ void ContractX2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();  // slice fully written before any thread reads it
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidx*P1D] * slice[i + tidy*Q1D + tidz*Q1D*Q1D];//contract x direction
  }
  __syncthreads();  // all reads done before the next stage overwrites slice
}
197 
// Contract the y direction of a 2D slab: stage U in the shared slice, then
// compute V = sum_i B[i + tidy*P1D] * slice[tidx][i].
// Contains __syncthreads(): must be reached by every thread of the block.
inline __device__ void ContractY2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();  // slice fully written before any thread reads it
  V = 0.0;
  for (int i = 0; i < P1D; ++i) {
    V += B[i + tidy*P1D] * slice[tidx + i*Q1D + tidz*Q1D*Q1D];//contract y direction
  }
  __syncthreads();  // all reads done before the next stage overwrites slice
}
210 
// Transposed y contraction of a 2D slab: stage U in the shared slice, then
// V = sum_i B[tidy + i*P1D] * slice[tidx][i] for threads with tidy < P1D
// (only P1D rows of output exist); other threads leave V = 0.
// The __syncthreads() calls sit outside the tidy guard so all threads reach
// them — do not move them inside the branch.
inline __device__ void ContractTransposeY2d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  if (tidy<P1D) {
    for (int i = 0; i < Q1D; ++i) {
      V += B[tidy + i*P1D] * slice[tidx + i*Q1D + tidz*Q1D*Q1D];//contract y direction
    }
  }
  __syncthreads();
}
224 
// Transposed x contraction of a 2D slab: stage U in the shared slice, then
// V = sum_i B[tidx + i*P1D] * slice[i][tidy] for threads with tidx < P1D;
// other threads leave V = 0. Barriers stay outside the guard so every
// thread of the block reaches them.
inline __device__ void ContractTransposeX2d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U;
  __syncthreads();
  V = 0.0;
  if (tidx<P1D) {
    for (int i = 0; i < Q1D; ++i) {
      V += B[tidx + i*P1D] * slice[i + tidy*Q1D + tidz*Q1D*Q1D];//contract x direction
    }
  }
  __syncthreads();
}
238 
// 2D interpolation: contract the x then y directions with B (or, when
// `transpose` is set, the transposed contractions in reverse order).
// tidz encodes both the component and the element within the block:
// comp = tidz % BASIS_NCOMP, blockElem = tidz / BASIS_NCOMP, so each
// Q1D x Q1D z-slab handles one (element, component) pair.
// Fix: the loop body redeclared `const int comp` with the identical value,
// shadowing the declaration above — the duplicate is removed.
// NOTE(review): slabs whose `elem` exceeds nelem skip the loop body while
// others call __syncthreads() inside the contractions; the host grid sizing
// appears to keep per-slab trip counts equal — confirm before changing it.
inline __device__ void interp2d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  // Grid-stride loop over elements, strided by elements handled per block.
  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    r_V = 0.0;
    r_t = 0.0;
    if(!transpose) {
      // Forward: nodes -> quadrature points.
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads2d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
    } else {
      // Transpose: quadrature points -> nodes (y first, then x).
      readQuads2d(elem, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}
272 
// 2D gradient: for each (element, component) pair mapped to a z-slab
// (comp = tidz % BASIS_NCOMP), compute both derivative dimensions.
// Forward: dim 0 = G_x then B_y; dim 1 = B_x then G_y.
// Transpose: the two dimensions are read back, contracted transposed in
// reverse order, and summed (r_V += r_U) into a single nodal contribution.
// NOTE(review): uneven per-slab trip counts of the elem loop would diverge
// across the __syncthreads() inside the contractions; the host grid sizing
// appears to prevent this — confirm before changing launch parameters.
inline __device__ void grad2d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  CeedScalar r_U;
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;
  int dim;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    if(!transpose) {
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_U);
      // d/dx: derivative in x, interpolation in y.
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 0;
      writeQuads2d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      // d/dy: interpolation in x, derivative in y.
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      dim = 1;
      writeQuads2d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
    } else {
      // Transpose of d/dx into r_V.
      dim = 0;
      readQuads2d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      // Transpose of d/dy into r_U, then accumulate.
      dim = 1;
      readQuads2d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      r_V+=r_U;  // sum contributions from both derivative dimensions
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}
315 //////////
316 //  3D  //
317 //////////
318 
// Load a z-column of nodal values at (tidx, tidy) for one (element,
// component) pair into the register array r_U[0..P1D), zero-filling both
// threads outside the P1D x P1D node grid and entries [P1D, Q1D).
// d_U layout: x-node fastest, then y, then z, then component, then element.
inline __device__ void readDofs3d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U, CeedScalar *r_U) {
  for (int i = 0; i < P1D; i++)
    r_U[i] = (tidx<P1D
              && tidy<P1D) ? d_U[tidx + tidy*P1D + i*P1D*P1D + comp*P1D*P1D*P1D +
                                      elem*BASIS_NCOMP*P1D*P1D*P1D ] : 0.0;
  for (int i = P1D; i < Q1D; i++)
    r_U[i] = 0.0;  // pad so Q1D-wide contractions read valid data
}
329 
// Load a z-column of quadrature values at (tidx, tidy) for one (element,
// component, dim) triple into r_U[0..Q1D). d_U layout: x-point fastest,
// then y, then z, then element, then component, then derivative dimension.
inline __device__ void readQuads3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem, const CeedScalar *d_U, CeedScalar *r_U) {
  for (int i = 0; i < Q1D; i++)
    r_U[i] = d_U[tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D +
                 comp*Q1D*Q1D*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D];
}
337 
// Store a z-column of P1D nodal values from r_V back to d_V for the given
// (element, component); only threads inside the P1D x P1D node grid write.
inline __device__ void writeDofs3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar *r_V, CeedScalar *d_V) {
  if (tidx<P1D && tidy<P1D) {
    for (int i = 0; i < P1D; i++)
      d_V[tidx + tidy*P1D + i*P1D*P1D + comp*P1D*P1D*P1D +
          elem*BASIS_NCOMP*P1D*P1D*P1D ] = r_V[i];
  }
}
347 
// Store a z-column of Q1D quadrature values from r_V to d_V for the given
// (element, component, dim). No guard: launch is Q1D x Q1D threads wide.
inline __device__ void writeQuads3d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem, const CeedScalar *r_V, CeedScalar *d_V) {
  for (int i = 0; i < Q1D; i++)
    d_V[tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D + comp*Q1D*Q1D*Q1D*nelem +
        dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D ] = r_V[i];
}
355 
// Contract the x direction of a 3D register column: for each z-level k,
// stage U[k] in the shared slice and compute
// V[k] = sum_i B[i + tidx*P1D] * slice[i][tidy].
// Contains __syncthreads() per level — all threads of the block must call
// this helper together (no divergent callers).
inline __device__ void ContractX3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();  // level k staged before reads
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + tidx*P1D] * slice[i + tidy*Q1D +
                                      tidz*Q1D*Q1D];//contract x direction
    }
    __syncthreads();  // reads done before level k+1 overwrites slice
  }
}
370 
// Contract the y direction of a 3D register column: for each z-level k,
// stage U[k] in the shared slice and compute
// V[k] = sum_i B[i + tidy*P1D] * slice[tidx][i].
// Contains __syncthreads() per level — must be called block-uniformly.
inline __device__ void ContractY3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();  // level k staged before reads
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + tidy*P1D] * slice[tidx + i*Q1D +
                                      tidz*Q1D*Q1D];//contract y direction
    }
    __syncthreads();  // reads done before level k+1 overwrites slice
  }
}
385 
// Contract the z direction: purely register-local, since each thread owns
// the full z-column — V[k] = sum_i B[i + k*P1D] * U[i]. No shared memory
// or barriers needed; slice/tidx/tidy/tidz are unused and kept only so all
// Contract*3d helpers share one call shape.
inline __device__ void ContractZ3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1D; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1D; ++i) {
      V[k] += B[i + k*P1D] * U[i];//contract z direction
    }
  }
}
396 
// Transposed z contraction, register-local: V[k] = sum_i B[k + i*P1D] * U[i]
// for k < P1D; levels [P1D, Q1D) are zeroed so downstream Q1D-wide stages
// read defined values. slice/tidx/tidy/tidz unused (uniform signature).
inline __device__ void ContractTransposeZ3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1D; ++k) {
    V[k] = 0.0;
    if (k<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[k + i*P1D] * U[i];//contract z direction
      }
    }
  }
}
409 
// Transposed y contraction of a 3D column: for each level k, stage U[k] in
// the shared slice, then V[k] = sum_i B[tidy + i*P1D] * slice[tidx][i] for
// threads with tidy < P1D (others leave V[k] = 0). Barriers sit outside the
// tidy guard so every thread of the block reaches them.
inline __device__ void ContractTransposeY3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (tidy<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[tidy + i*P1D] * slice[tidx + i*Q1D +
                                        tidz*Q1D*Q1D];//contract y direction
      }
    }
    __syncthreads();
  }
}
426 
// Transposed x contraction of a 3D column: for each level k, stage U[k] in
// the shared slice, then V[k] = sum_i B[tidx + i*P1D] * slice[i][tidy] for
// threads with tidx < P1D (others leave V[k] = 0). Barriers sit outside the
// tidx guard so every thread of the block reaches them.
inline __device__ void ContractTransposeX3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1D; ++k) {
    slice[tidx+tidy*Q1D+tidz*Q1D*Q1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (tidx<P1D) {
      for (int i = 0; i < Q1D; ++i) {
        V[k] += B[tidx + i*P1D] * slice[i + tidy*Q1D +
                                        tidz*Q1D*Q1D];//contract x direction
      }
    }
    __syncthreads();
  }
}
443 
// 3D interpolation: contract x, y, then z with B (or the transposed
// contractions in reverse order). Each thread owns a full z-column in the
// register arrays r_V/r_t; tidz maps the (element, component) pair as in
// the 2D kernels (comp = tidz % BASIS_NCOMP).
// NOTE(review): uneven per-slab trip counts of the elem loop would diverge
// across the __syncthreads() inside the contractions; the host sizes the
// grid to cover all elements — confirm before changing launch parameters.
inline __device__ void interp3d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    // Clear register columns so padded/guarded lanes contribute zeros.
    for (int i = 0; i < Q1D; ++i) {
      r_V[i] = 0.0;
      r_t[i] = 0.0;
    }
    if(!transpose) {
      // Forward: nodes -> quadrature points (x, y, z order).
      readDofs3d(elem, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeQuads3d(elem, tidx, tidy, comp, 0, nelem, r_t, d_V);
    } else {
      // Transpose: quadrature points -> nodes (z, y, x order).
      readQuads3d(elem, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeDofs3d(elem, tidx, tidy, comp, nelem, r_t, d_V);
    }
  }
}
479 
// 3D gradient: for each (element, component) z-slab pair, compute all three
// derivative dimensions. Forward: dim d applies G along direction d and B
// along the other two. Transpose: each dimension is read back, contracted
// transposed in reverse (z, y, x) order, and accumulated into r_V via add().
// NOTE(review): the elem-loop divergence caveat from interp3d applies here
// too — the contractions contain block-wide barriers.
inline __device__ void grad3d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  //use P1D for one of these
  CeedScalar r_U[Q1D];
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;
  int dim;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    if(!transpose) {
      readDofs3d(elem, tidx, tidy, comp, nelem, d_U, r_U);
      // d/dx: G in x, B in y and z.
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 0;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      // d/dy: G in y, B in x and z.
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_G, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      dim = 1;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
      // d/dz: G in z, B in x and y.
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      dim = 2;
      writeQuads3d(elem, tidx, tidy, comp, dim, nelem, r_V, d_V);
    } else {
      // Transpose of d/dx into r_V.
      dim = 0;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      // Transpose of d/dy, accumulated.
      dim = 1;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_G, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      // Transpose of d/dz, accumulated.
      dim = 2;
      readQuads3d(elem, tidx, tidy, comp, dim, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      writeDofs3d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}
538 
539 /////////////
540 // Kernels //
541 /////////////
// JIT entry point: tensor-product interpolation dispatching on BASIS_DIM
// (a compile-time macro, so the dead branches vanish at JIT compilation).
// Dynamic shared memory (`slice`) is sized by the host launch.
extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
                                  const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  // Fix: declare the shared buffer with the scalar type the contractions
  // actually use; the previous hard-coded `double` only worked because
  // CeedScalar happened to be double and would not compile otherwise.
  extern __shared__ CeedScalar slice[];
  if (BASIS_DIM==1) {
    interp1d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM==2) {
    interp2d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM==3) {
    interp3d(nelem, transpose, c_B, d_U, d_V, slice);
  }
}
554 
// JIT entry point: tensor-product gradient dispatching on BASIS_DIM
// (compile-time macro). Dynamic shared memory sized by the host launch.
extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B, const CeedScalar *c_G,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  // Fix: match the shared buffer type to CeedScalar instead of hard-coding
  // double (see the interp kernel for the rationale).
  extern __shared__ CeedScalar slice[];
  if (BASIS_DIM==1) {
    grad1d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM==2) {
    grad2d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM==3) {
    grad3d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  }
}
567 
568 /////////////
569 // Weights //
570 /////////////
// Write the 1D quadrature weight for this thread's point (tidx) into w for
// every element in a grid-stride loop over blockDim.y element slots.
// Assumes the launch is exactly Q1D threads wide in x (the host launch
// uses (Q1d, elemsPerBlock, 1)), so qweight1d[tid] is in bounds.
__device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int tid = threadIdx.x;
  const CeedScalar weight = qweight1d[tid];
  for (CeedInt elem = blockIdx.x*blockDim.y + threadIdx.y; elem < nelem;
       elem += gridDim.x*blockDim.y) {
    const int ind = elem*Q1D + tid;
    w[ind] = weight;
  }
}
581 
// Write the 2D quadrature weight (tensor product of the 1D weights at this
// thread's (i, j) point) for every element in a grid-stride loop over
// blockDim.z element slots. Launch is Q1D x Q1D threads per element slab.
__device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int i = threadIdx.x;
  const int j = threadIdx.y;
  const CeedScalar weight = qweight1d[i]*qweight1d[j];
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += gridDim.x*blockDim.z) {
    const int ind = elem*Q1D*Q1D + i + j*Q1D;
    w[ind] = weight;
  }
}
593 
// Write the 3D quadrature weight (tensor product of the 1D weights at this
// thread's (i, j, k) point) for every element; one element per block,
// grid-stride over elements. Launch is Q1D x Q1D x Q1D threads per block.
__device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int i = threadIdx.x;
  const int j = threadIdx.y;
  const int k = threadIdx.z;
  const CeedScalar weight = qweight1d[i]*qweight1d[j]*qweight1d[k];
  for (int e = blockIdx.x; e < nelem; e += gridDim.x) {
    const int ind = e*Q1D*Q1D*Q1D + i + j*Q1D + k*Q1D*Q1D;
    w[ind] = weight;
  }
}
605 
// JIT entry point: fill v with tensor-product quadrature weights,
// dispatching on the compile-time BASIS_DIM macro.
extern "C" __global__ void weight(const CeedInt nelem,
                                  const CeedScalar *__restrict__ qweight1d, CeedScalar *__restrict__ v) {
  switch (BASIS_DIM) {
  case 1:
    weight1d(nelem, qweight1d, v);
    break;
  case 2:
    weight2d(nelem, qweight1d, v);
    break;
  case 3:
    weight3d(nelem, qweight1d, v);
    break;
  }
}
616 
617 );
618 
619 int CeedCudaInitInterp(CeedScalar *d_B, CeedInt P1d, CeedInt Q1d,
620                        CeedScalar **c_B);
621 int CeedCudaInitInterpGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P1d,
622                            CeedInt Q1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
623 
624 int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt nelem,
625                                      CeedTransposeMode tmode,
626                                      CeedEvalMode emode, CeedVector u, CeedVector v) {
627   int ierr;
628   Ceed ceed;
629   ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
630   Ceed_Cuda_shared *ceed_Cuda;
631   CeedGetData(ceed, (void *) &ceed_Cuda); CeedChk(ierr);
632   CeedBasis_Cuda_shared *data;
633   CeedBasisGetData(basis, (void *)&data); CeedChk(ierr);
634   const CeedInt transpose = tmode == CEED_TRANSPOSE;
635   CeedInt dim, ncomp;
636   ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
637   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
638 
639   const CeedScalar *d_u;
640   CeedScalar *d_v;
641   if(emode!=CEED_EVAL_WEIGHT) {
642     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChk(ierr);
643   }
644   ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChk(ierr);
645 
646   if (tmode == CEED_TRANSPOSE) {
647     CeedInt length;
648     ierr = CeedVectorGetLength(v, &length); CeedChk(ierr);
649     ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChk(ierr);
650   }
651   if (emode == CEED_EVAL_INTERP) {
652     CeedInt P1d, Q1d;
653     ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
654     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
655     ierr = CeedCudaInitInterp(data->d_interp1d, P1d, Q1d, &data->c_B);
656     CeedChk(ierr);
657     void *interpargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B, &d_u, &d_v};
658     if (dim==1) {
659       CeedInt elemsPerBlock = 32;
660       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
661                                              ? 1 : 0 );
662       CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
663       ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, 1,
664                                         elemsPerBlock, sharedMem,
665                                         interpargs);
666       CeedChk(ierr);
667     } else if (dim==2) {
668       const CeedInt optElems[7] = {0,32,8,6,4,2,8};
669       CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
670       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
671                                              ? 1 : 0 );
672       CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
673       ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
674                                         ncomp*elemsPerBlock, sharedMem,
675                                         interpargs);
676       CeedChk(ierr);
677     } else if (dim==3) {
678       CeedInt elemsPerBlock = 1;
679       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
680                                              ? 1 : 0 );
681       CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
682       ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
683                                         ncomp*elemsPerBlock, sharedMem,
684                                         interpargs);
685       CeedChk(ierr);
686     }
687   } else if (emode == CEED_EVAL_GRAD) {
688     CeedInt P1d, Q1d;
689     ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
690     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
691     ierr = CeedCudaInitInterpGrad(data->d_interp1d, data->d_grad1d, P1d,
692                                   Q1d, &data->c_B, &data->c_G);
693     CeedChk(ierr);
694     void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B, &data->c_G, &d_u, &d_v};
695     if (dim==1) {
696       CeedInt elemsPerBlock = 32;
697       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
698                                              ? 1 : 0 );
699       CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
700       ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, 1, elemsPerBlock,
701                                         sharedMem,
702                                         gradargs);
703       CeedChk(ierr);
704     } else if (dim==2) {
705       const CeedInt optElems[7] = {0,32,8,6,4,2,8};
706       CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
707       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
708                                              ? 1 : 0 );
709       CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
710       ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
711                                         ncomp*elemsPerBlock, sharedMem,
712                                         gradargs);
713       CeedChk(ierr);
714     } else if (dim==3) {
715       CeedInt elemsPerBlock = 1;
716       CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
717                                              ? 1 : 0 );
718       CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
719       ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
720                                         ncomp*elemsPerBlock, sharedMem,
721                                         gradargs);
722       CeedChk(ierr);
723     }
724   } else if (emode == CEED_EVAL_WEIGHT) {
725     CeedInt Q1d;
726     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
727     void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
728     if(dim==1) {
729       const CeedInt elemsPerBlock = 32/Q1d;
730       const CeedInt gridsize = nelem/elemsPerBlock + ( (
731                                  nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
732       ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, elemsPerBlock, 1,
733                                   weightargs);
734       CeedChk(ierr);
735     } else if(dim==2) {
736       const CeedInt optElems = 32/(Q1d*Q1d);
737       const CeedInt elemsPerBlock = optElems>0?optElems:1;
738       const CeedInt gridsize = nelem/elemsPerBlock + ( (
739                                  nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
740       ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d,
741                                   elemsPerBlock, weightargs);
742       CeedChk(ierr);
743     } else if(dim==3) {
744       const CeedInt gridsize = nelem;
745       ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d, Q1d,
746                                   weightargs);
747       CeedChk(ierr);
748     }
749   }
750 
751   if(emode!=CEED_EVAL_WEIGHT) {
752     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChk(ierr);
753   }
754   ierr = CeedVectorRestoreArray(v, &d_v); CeedChk(ierr);
755 
756   return 0;
757 }
758 
759 static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
760   int ierr;
761   Ceed ceed;
762   ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
763 
764   CeedBasis_Cuda_shared *data;
765   ierr = CeedBasisGetData(basis, (void *) &data); CeedChk(ierr);
766 
767   CeedChk_Cu(ceed, cuModuleUnload(data->module));
768 
769   ierr = cudaFree(data->d_qweight1d); CeedChk_Cu(ceed, ierr);
770   ierr = cudaFree(data->d_interp1d); CeedChk_Cu(ceed, ierr);
771   ierr = cudaFree(data->d_grad1d); CeedChk_Cu(ceed, ierr);
772 
773   ierr = CeedFree(&data); CeedChk(ierr);
774 
775   return 0;
776 }
777 
778 int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P1d, CeedInt Q1d,
779                                         const CeedScalar *interp1d,
780                                         const CeedScalar *grad1d,
781                                         const CeedScalar *qref1d,
782                                         const CeedScalar *qweight1d,
783                                         CeedBasis basis) {
784   int ierr;
785   Ceed ceed;
786   ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
787   if (Q1d<P1d) {
788     return CeedError(ceed, 1, "Backend does not implement underintegrated basis.");
789   }
790   CeedBasis_Cuda_shared *data;
791   ierr = CeedCalloc(1, &data); CeedChk(ierr);
792 
793   const CeedInt qBytes = Q1d * sizeof(CeedScalar);
794   ierr = cudaMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Cu(ceed, ierr);
795   ierr = cudaMemcpy(data->d_qweight1d, qweight1d, qBytes,
796                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
797 
798   const CeedInt iBytes = qBytes * P1d;
799   ierr = cudaMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Cu(ceed, ierr);
800   ierr = cudaMemcpy(data->d_interp1d, interp1d, iBytes,
801                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
802 
803   ierr = cudaMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Cu(ceed, ierr);
804   ierr = cudaMemcpy(data->d_grad1d, grad1d, iBytes,
805                     cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
806 
807   data->d_collograd1d = NULL;
808   if (dim==3 && Q1d >= P1d) {
809     CeedScalar *collograd1d;
810     ierr = CeedMalloc(Q1d*Q1d, &collograd1d); CeedChk(ierr);
811     ierr = CeedBasisGetCollocatedGrad(basis, collograd1d); CeedChk(ierr);
812     ierr = cudaMalloc((void **)&data->d_collograd1d, qBytes * Q1d);
813     CeedChk_Cu(ceed, ierr);
814     ierr = cudaMemcpy(data->d_collograd1d, collograd1d, qBytes * Q1d,
815                       cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
816   }
817 
818   CeedInt ncomp;
819   ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
820   ierr = CeedCompileCuda(ceed, kernelsShared, &data->module, 7,
821                          "Q1D", Q1d,
822                          "P1D", P1d,
823                          "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
824                              Q1d : P1d, dim),
825                          "BASIS_DIM", dim,
826                          "BASIS_NCOMP", ncomp,
827                          "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
828                          "BASIS_NQPT", CeedIntPow(Q1d, dim)
829                         ); CeedChk(ierr);
830   ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp);
831   CeedChk(ierr);
832   ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad);
833   CeedChk(ierr);
834   ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight);
835   CeedChk(ierr);
836 
837   ierr = CeedBasisSetData(basis, (void *)&data);
838   CeedChk(ierr);
839   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
840                                 CeedBasisApplyTensor_Cuda_shared);
841   CeedChk(ierr);
842   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
843                                 CeedBasisDestroy_Cuda_shared);
844   CeedChk(ierr);
845   return 0;
846 }
847