xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 6d69246ad8d06846a22b2f7e3720a77f7a77e7f6)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
187df94212SJeremy L Thompson 
19ec3da8bcSJed Brown #include <ceed/ceed.h>
20ec3da8bcSJed Brown #include <ceed/backend.h>
213d576824SJeremy L Thompson #include <cuda_runtime.h>
22241a4b83SYohann #include <iostream>
23241a4b83SYohann #include <sstream>
243d576824SJeremy L Thompson #include "ceed-cuda-gen.h"
25*6d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
26241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
27241a4b83SYohann 
28f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
29ab213215SJeremy L Thompson //------------------------------------------------------------------------------
30ab213215SJeremy L Thompson // Atomic add, for older CUDA
31ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3280a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
33241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
34241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
35241a4b83SYohann   do {
36241a4b83SYohann     assumed = old;
37241a4b83SYohann     old =
38241a4b83SYohann       atomicCAS(address_as_ull, assumed,
39241a4b83SYohann                 __double_as_longlong(val +
40241a4b83SYohann                                     __longlong_as_double(assumed)));
41241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
42241a4b83SYohann     // (since NaN != NaN)
43241a4b83SYohann   } while (assumed != old);
44241a4b83SYohann   return __longlong_as_double(old);
45241a4b83SYohann }
46f1a13f77SYohann Dudouit );
47f1a13f77SYohann Dudouit 
48f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
49f1a13f77SYohann Dudouit 
50ab213215SJeremy L Thompson //------------------------------------------------------------------------------
51ab213215SJeremy L Thompson // Typedefs
52ab213215SJeremy L Thompson //------------------------------------------------------------------------------
53f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
54f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
55f1a13f77SYohann Dudouit 
56f1a13f77SYohann Dudouit typedef struct {
57f1a13f77SYohann Dudouit   CeedInt tidx;
58f1a13f77SYohann Dudouit   CeedInt tidy;
59f1a13f77SYohann Dudouit   CeedInt tidz;
60f1a13f77SYohann Dudouit   CeedInt tid;
61f1a13f77SYohann Dudouit   CeedScalar* slice;
62f1a13f77SYohann Dudouit } BackendData;
63241a4b83SYohann 
64ab213215SJeremy L Thompson //------------------------------------------------------------------------------
65ab213215SJeremy L Thompson // Load matrices for basis actions
66ab213215SJeremy L Thompson //------------------------------------------------------------------------------
67241a4b83SYohann template <int P, int Q>
68053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
69ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
70241a4b83SYohann     B[i] = d_B[i];
71241a4b83SYohann }
72241a4b83SYohann 
73ab213215SJeremy L Thompson //------------------------------------------------------------------------------
74241a4b83SYohann // 1D
75ab213215SJeremy L Thompson //------------------------------------------------------------------------------
76ab213215SJeremy L Thompson 
77ab213215SJeremy L Thompson //------------------------------------------------------------------------------
78ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
79ab213215SJeremy L Thompson //------------------------------------------------------------------------------
805c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
81053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
82ab213215SJeremy L Thompson   if (data.tidx < P1d) {
834d537eeaSYohann     const CeedInt node = data.tidx;
84ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
85ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
865c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
87241a4b83SYohann   }
88241a4b83SYohann }
89920dcdc4Sjeremylt 
90ab213215SJeremy L Thompson //------------------------------------------------------------------------------
91ab213215SJeremy L Thompson // L-vector -> E-vector, strided
92ab213215SJeremy L Thompson //------------------------------------------------------------------------------
93d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
94053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
95ab213215SJeremy L Thompson   if (data.tidx < P1d) {
96ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
97d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
98ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
99d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
100ccedf6b0Sjeremylt   }
101ccedf6b0Sjeremylt }
102241a4b83SYohann 
103ab213215SJeremy L Thompson //------------------------------------------------------------------------------
104ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
105ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1065c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
107053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
108ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1094d537eeaSYohann     const CeedInt node = data.tidx;
110ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
111ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1125c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
113241a4b83SYohann   }
114241a4b83SYohann }
115241a4b83SYohann 
116ab213215SJeremy L Thompson //------------------------------------------------------------------------------
117ab213215SJeremy L Thompson // E-vector -> L-vector, strided
118ab213215SJeremy L Thompson //------------------------------------------------------------------------------
119d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
120d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
121ab213215SJeremy L Thompson   if (data.tidx < P1d) {
122ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
123d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
124ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
125d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
126ccedf6b0Sjeremylt   }
127ccedf6b0Sjeremylt }
128ccedf6b0Sjeremylt 
129ab213215SJeremy L Thompson //------------------------------------------------------------------------------
130ab213215SJeremy L Thompson // 1D tensor contraction x
131ab213215SJeremy L Thompson //------------------------------------------------------------------------------
132241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
133ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
134241a4b83SYohann   data.slice[data.tidx] = *U;
135241a4b83SYohann   __syncthreads();
136241a4b83SYohann   *V = 0.0;
13718d499f1SYohann   if (data.tidx < Q1d)
138ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
139ab213215SJeremy L Thompson       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
140241a4b83SYohann   __syncthreads();
141241a4b83SYohann }
142241a4b83SYohann 
143ab213215SJeremy L Thompson //------------------------------------------------------------------------------
144ab213215SJeremy L Thompson // 1D transpose tensor contraction x
145ab213215SJeremy L Thompson //------------------------------------------------------------------------------
146241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
147ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
148241a4b83SYohann   data.slice[data.tidx] = *U;
149241a4b83SYohann   __syncthreads();
150241a4b83SYohann   *V = 0.0;
15118d499f1SYohann   if (data.tidx < P1d)
152ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
153ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
154241a4b83SYohann   __syncthreads();
155241a4b83SYohann }
156241a4b83SYohann 
157ab213215SJeremy L Thompson //------------------------------------------------------------------------------
158ab213215SJeremy L Thompson // 1D interpolate to quadrature points
159ab213215SJeremy L Thompson //------------------------------------------------------------------------------
160241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
161ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
162ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
163241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
164241a4b83SYohann }
165241a4b83SYohann 
166ab213215SJeremy L Thompson //------------------------------------------------------------------------------
167ab213215SJeremy L Thompson // 1D interpolate transpose
168ab213215SJeremy L Thompson //------------------------------------------------------------------------------
169241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
170ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
171ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
172241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
173241a4b83SYohann }
174241a4b83SYohann 
175ab213215SJeremy L Thompson //------------------------------------------------------------------------------
176ab213215SJeremy L Thompson // 1D derivatives at quadrature points
177ab213215SJeremy L Thompson //------------------------------------------------------------------------------
178241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
179ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
180ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
181241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
182241a4b83SYohann }
183241a4b83SYohann 
184ab213215SJeremy L Thompson //------------------------------------------------------------------------------
185ab213215SJeremy L Thompson // 1D derivatives transpose
186ab213215SJeremy L Thompson //------------------------------------------------------------------------------
187241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
188ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
189ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
190241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
191241a4b83SYohann }
192241a4b83SYohann 
193ab213215SJeremy L Thompson //------------------------------------------------------------------------------
194241a4b83SYohann // 2D
195ab213215SJeremy L Thompson //------------------------------------------------------------------------------
196ab213215SJeremy L Thompson 
197ab213215SJeremy L Thompson //------------------------------------------------------------------------------
198ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
199ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2005c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
201053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
202ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2034d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
204ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
205ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2065c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
207241a4b83SYohann   }
208241a4b83SYohann }
209920dcdc4Sjeremylt 
210ab213215SJeremy L Thompson //------------------------------------------------------------------------------
211ab213215SJeremy L Thompson // L-vector -> E-vector, strided
212ab213215SJeremy L Thompson //------------------------------------------------------------------------------
213d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
214053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
215ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
216ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
217d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
218ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
219d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
220ccedf6b0Sjeremylt   }
221ccedf6b0Sjeremylt }
222241a4b83SYohann 
223ab213215SJeremy L Thompson //------------------------------------------------------------------------------
224ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
225ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2265c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
227053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
228ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2294d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
230ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
231ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2325c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
233241a4b83SYohann   }
234241a4b83SYohann }
235241a4b83SYohann 
236ab213215SJeremy L Thompson //------------------------------------------------------------------------------
237ab213215SJeremy L Thompson // E-vector -> L-vector, strided
238ab213215SJeremy L Thompson //------------------------------------------------------------------------------
239d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
240d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
241ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
242ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
243d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
244ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
245d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
246ccedf6b0Sjeremylt   }
247ccedf6b0Sjeremylt }
248ccedf6b0Sjeremylt 
249ab213215SJeremy L Thompson //------------------------------------------------------------------------------
250ab213215SJeremy L Thompson // 2D tensor contraction x
251ab213215SJeremy L Thompson //------------------------------------------------------------------------------
252241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
253ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
25418d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
255241a4b83SYohann   __syncthreads();
256241a4b83SYohann   *V = 0.0;
25718d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
258ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
25918d499f1SYohann       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
260241a4b83SYohann   __syncthreads();
261241a4b83SYohann }
262241a4b83SYohann 
263ab213215SJeremy L Thompson //------------------------------------------------------------------------------
264ab213215SJeremy L Thompson // 2D tensor contract y
265ab213215SJeremy L Thompson //------------------------------------------------------------------------------
266241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
267ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
26818d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
269241a4b83SYohann   __syncthreads();
270241a4b83SYohann   *V = 0.0;
27118d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d)
272ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
27318d499f1SYohann       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
274241a4b83SYohann   __syncthreads();
275241a4b83SYohann }
276241a4b83SYohann 
277ab213215SJeremy L Thompson //------------------------------------------------------------------------------
278ab213215SJeremy L Thompson // 2D transpose tensor contract y
279ab213215SJeremy L Thompson //------------------------------------------------------------------------------
280241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
281ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
28218d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
283241a4b83SYohann   __syncthreads();
284241a4b83SYohann   *V = 0.0;
28518d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
286ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
28718d499f1SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
288241a4b83SYohann   __syncthreads();
289241a4b83SYohann }
290241a4b83SYohann 
291ab213215SJeremy L Thompson //------------------------------------------------------------------------------
292ab213215SJeremy L Thompson // 2D transpose tensor contract x
293ab213215SJeremy L Thompson //------------------------------------------------------------------------------
294241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
295ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
29618d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
297241a4b83SYohann   __syncthreads();
298241a4b83SYohann   *V = 0.0;
29918d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
300ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
30118d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
302241a4b83SYohann   __syncthreads();
303241a4b83SYohann }
304241a4b83SYohann 
305ab213215SJeremy L Thompson //------------------------------------------------------------------------------
306ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
307ab213215SJeremy L Thompson //------------------------------------------------------------------------------
308241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
309ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
31018d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
311241a4b83SYohann   __syncthreads();
31218d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
313ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
31418d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
315241a4b83SYohann   __syncthreads();
316241a4b83SYohann }
317241a4b83SYohann 
318ab213215SJeremy L Thompson //------------------------------------------------------------------------------
319ab213215SJeremy L Thompson // 2D interpolate to quadrature points
320ab213215SJeremy L Thompson //------------------------------------------------------------------------------
321241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
322ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
323241a4b83SYohann   CeedScalar r_t[1];
324ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
325241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
326241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
327241a4b83SYohann   }
328241a4b83SYohann }
329241a4b83SYohann 
330ab213215SJeremy L Thompson //------------------------------------------------------------------------------
331ab213215SJeremy L Thompson // 2D interpolate transpose
332ab213215SJeremy L Thompson //------------------------------------------------------------------------------
333241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
334ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
335241a4b83SYohann   CeedScalar r_t[1];
336ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
337241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
338241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
339241a4b83SYohann   }
340241a4b83SYohann }
341241a4b83SYohann 
342ab213215SJeremy L Thompson //------------------------------------------------------------------------------
343ab213215SJeremy L Thompson // 2D derivatives at quadrature points
344ab213215SJeremy L Thompson //------------------------------------------------------------------------------
345241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
346ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
347241a4b83SYohann   CeedScalar r_t[1];
348ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
349241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
350241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
351241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
352241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
353241a4b83SYohann   }
354241a4b83SYohann }
355241a4b83SYohann 
356ab213215SJeremy L Thompson //------------------------------------------------------------------------------
357ab213215SJeremy L Thompson // 2D derivatives transpose
358ab213215SJeremy L Thompson //------------------------------------------------------------------------------
359241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
360ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
361241a4b83SYohann   CeedScalar r_t[1];
362ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
363241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
364241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
365241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
366241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
367241a4b83SYohann   }
368241a4b83SYohann }
369241a4b83SYohann 
370ab213215SJeremy L Thompson //------------------------------------------------------------------------------
371241a4b83SYohann // 3D
372ab213215SJeremy L Thompson //------------------------------------------------------------------------------
373ab213215SJeremy L Thompson 
374ab213215SJeremy L Thompson //------------------------------------------------------------------------------
375ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
376ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3775c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
378053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
379ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
380241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3814d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
382ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
383ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3845c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
385241a4b83SYohann     }
386241a4b83SYohann }
387920dcdc4Sjeremylt 
388ab213215SJeremy L Thompson //------------------------------------------------------------------------------
389ab213215SJeremy L Thompson // L-vector -> E-vector, strided
390ab213215SJeremy L Thompson //------------------------------------------------------------------------------
391d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
392053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
393ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
394ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
395ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
396d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
397ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
398d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
399ccedf6b0Sjeremylt     }
400ccedf6b0Sjeremylt }
401241a4b83SYohann 
402ab213215SJeremy L Thompson //------------------------------------------------------------------------------
403ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
404ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4055c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
406053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
40718d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
408ac421f39SYohann     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
409920dcdc4Sjeremylt     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
410ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4115c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
412ac421f39SYohann   }
41318d499f1SYohann }
414ac421f39SYohann 
415ab213215SJeremy L Thompson //------------------------------------------------------------------------------
416ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
417ab213215SJeremy L Thompson //------------------------------------------------------------------------------
418d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
419053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
42018d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
421920dcdc4Sjeremylt     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
422d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
423ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
424d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
425920dcdc4Sjeremylt   }
42618d499f1SYohann }
427920dcdc4Sjeremylt 
428ab213215SJeremy L Thompson //------------------------------------------------------------------------------
429ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
430ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4315c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
432053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
433ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
434241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4354d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
436ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
437ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4385c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
439241a4b83SYohann     }
440241a4b83SYohann }
441241a4b83SYohann 
442ab213215SJeremy L Thompson //------------------------------------------------------------------------------
443ab213215SJeremy L Thompson // E-vector -> L-vector, strided
444ab213215SJeremy L Thompson //------------------------------------------------------------------------------
445d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4462f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
447ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
448ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
449ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
450d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
451ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
452d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
453ccedf6b0Sjeremylt     }
454ccedf6b0Sjeremylt }
455ccedf6b0Sjeremylt 
456ab213215SJeremy L Thompson //------------------------------------------------------------------------------
457ab213215SJeremy L Thompson // 3D tensor contract x
458ab213215SJeremy L Thompson //------------------------------------------------------------------------------
459241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
460ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
461ac421f39SYohann   CeedScalar r_B[P1d];
462ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
463ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
464ab213215SJeremy L Thompson 
465ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
46618d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
467241a4b83SYohann     __syncthreads();
468241a4b83SYohann     V[k] = 0.0;
46918d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
470ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
47118d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
472241a4b83SYohann     __syncthreads();
473241a4b83SYohann   }
474241a4b83SYohann }
475241a4b83SYohann 
476ab213215SJeremy L Thompson //------------------------------------------------------------------------------
477ab213215SJeremy L Thompson // 3D tensor contract y
478ab213215SJeremy L Thompson //------------------------------------------------------------------------------
479241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
480ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
481ac421f39SYohann   CeedScalar r_B[P1d];
482ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
483ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
484ab213215SJeremy L Thompson 
485ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
48618d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
487241a4b83SYohann     __syncthreads();
488241a4b83SYohann     V[k] = 0.0;
48918d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
490ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
49118d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
492241a4b83SYohann     __syncthreads();
493241a4b83SYohann   }
494241a4b83SYohann }
495241a4b83SYohann 
496ab213215SJeremy L Thompson //------------------------------------------------------------------------------
497ab213215SJeremy L Thompson // 3D tensor contract z
498ab213215SJeremy L Thompson //------------------------------------------------------------------------------
499241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
500ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
501ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
502241a4b83SYohann     V[k] = 0.0;
50318d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
504ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
505ab213215SJeremy L Thompson         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
506241a4b83SYohann   }
507241a4b83SYohann }
508241a4b83SYohann 
509ab213215SJeremy L Thompson //------------------------------------------------------------------------------
510ab213215SJeremy L Thompson // 3D transpose tensor contract z
511ab213215SJeremy L Thompson //------------------------------------------------------------------------------
512241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
513ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
51418d499f1SYohann   for (CeedInt k = 0; k < P1d; ++k) {
515241a4b83SYohann     V[k] = 0.0;
51618d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
517ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
518ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
519241a4b83SYohann   }
520241a4b83SYohann }
521241a4b83SYohann 
522ab213215SJeremy L Thompson //------------------------------------------------------------------------------
523ab213215SJeremy L Thompson // 3D transpose tensor contract y
524ab213215SJeremy L Thompson //------------------------------------------------------------------------------
525241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
526ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
527ac421f39SYohann   CeedScalar r_B[Q1d];
528ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
529ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
530ab213215SJeremy L Thompson 
531ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
53218d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
533241a4b83SYohann     __syncthreads();
534241a4b83SYohann     V[k] = 0.0;
53518d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
536ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
53718d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
538ac421f39SYohann     __syncthreads();
539ac421f39SYohann   }
540ac421f39SYohann }
541ac421f39SYohann 
542ab213215SJeremy L Thompson //------------------------------------------------------------------------------
543ab213215SJeremy L Thompson // 3D transpose tensor contract add y
544ab213215SJeremy L Thompson //------------------------------------------------------------------------------
545ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
546ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
547ac421f39SYohann   CeedScalar r_B[Q1d];
548ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
549ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
550ab213215SJeremy L Thompson 
551ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
55218d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
553ac421f39SYohann     __syncthreads();
55418d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
555ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
55618d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
557241a4b83SYohann     __syncthreads();
558241a4b83SYohann   }
559241a4b83SYohann }
560241a4b83SYohann 
561ab213215SJeremy L Thompson //------------------------------------------------------------------------------
562ab213215SJeremy L Thompson // 3D transpose tensor contract x
563ab213215SJeremy L Thompson //------------------------------------------------------------------------------
564241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
565ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
566ac421f39SYohann   CeedScalar r_B[Q1d];
567ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
568ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
569ab213215SJeremy L Thompson 
570ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
57118d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
572241a4b83SYohann     __syncthreads();
573241a4b83SYohann     V[k] = 0.0;
57418d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
575ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
57618d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
577241a4b83SYohann     __syncthreads();
578241a4b83SYohann   }
579241a4b83SYohann }
580241a4b83SYohann 
581ab213215SJeremy L Thompson //------------------------------------------------------------------------------
582ab213215SJeremy L Thompson // 3D transpose tensor contract add x
583ab213215SJeremy L Thompson //------------------------------------------------------------------------------
584241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
585ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
586ac421f39SYohann   CeedScalar r_B[Q1d];
587ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
588ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
589ab213215SJeremy L Thompson 
590ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
59118d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
592241a4b83SYohann     __syncthreads();
59318d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
594ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
59518d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
596241a4b83SYohann     __syncthreads();
597241a4b83SYohann   }
598241a4b83SYohann }
599241a4b83SYohann 
600ab213215SJeremy L Thompson //------------------------------------------------------------------------------
601ab213215SJeremy L Thompson // 3D interpolate to quadrature points
602ab213215SJeremy L Thompson //------------------------------------------------------------------------------
603241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
604ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
60518d499f1SYohann   CeedScalar r_t1[T1d];
60618d499f1SYohann   CeedScalar r_t2[T1d];
607ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
608241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
609241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
610241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
611241a4b83SYohann   }
612241a4b83SYohann }
613241a4b83SYohann 
614ab213215SJeremy L Thompson //------------------------------------------------------------------------------
615ab213215SJeremy L Thompson // 3D interpolate transpose
616ab213215SJeremy L Thompson //------------------------------------------------------------------------------
617241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
618ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
61918d499f1SYohann   CeedScalar r_t1[T1d];
62018d499f1SYohann   CeedScalar r_t2[T1d];
621ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
622241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
623241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
624241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
625241a4b83SYohann   }
626241a4b83SYohann }
627241a4b83SYohann 
628ab213215SJeremy L Thompson //------------------------------------------------------------------------------
629ab213215SJeremy L Thompson // 3D derivatives at quadrature points
630ab213215SJeremy L Thompson //------------------------------------------------------------------------------
631241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
632ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
63318d499f1SYohann   CeedScalar r_t1[T1d];
63418d499f1SYohann   CeedScalar r_t2[T1d];
635ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
636241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
637241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
638241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
639241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
640241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
641241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
642241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
643241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
644241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
645241a4b83SYohann   }
646241a4b83SYohann }
647241a4b83SYohann 
648ab213215SJeremy L Thompson //------------------------------------------------------------------------------
649ab213215SJeremy L Thompson // 3D derivatives transpose
650ab213215SJeremy L Thompson //------------------------------------------------------------------------------
651241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
652ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
65318d499f1SYohann   CeedScalar r_t1[T1d];
65418d499f1SYohann   CeedScalar r_t2[T1d];
655ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
656241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
657241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
658241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
659241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
660241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
661241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
662241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
663241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
664241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
665241a4b83SYohann   }
666241a4b83SYohann }
667241a4b83SYohann 
668ab213215SJeremy L Thompson //------------------------------------------------------------------------------
669ab213215SJeremy L Thompson // 3D collocated derivatives computation
670ab213215SJeremy L Thompson //------------------------------------------------------------------------------
671ac421f39SYohann template <int NCOMP, int Q1d>
672ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
67318d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
674ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
67518d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
676ac421f39SYohann       __syncthreads();
677ac421f39SYohann       // X derivative
678ac421f39SYohann       r_V[comp+0*NCOMP] = 0.0;
679ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
68018d499f1SYohann         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
681ac421f39SYohann       // Y derivative
682ac421f39SYohann       r_V[comp+1*NCOMP] = 0.0;
683ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
68418d499f1SYohann         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
685ac421f39SYohann       // Z derivative
686ac421f39SYohann       r_V[comp+2*NCOMP] = 0.0;
687ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
688ab213215SJeremy L Thompson         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
689ac421f39SYohann       __syncthreads();
690ac421f39SYohann     }
691ac421f39SYohann   }
69218d499f1SYohann }
693ac421f39SYohann 
694ab213215SJeremy L Thompson //------------------------------------------------------------------------------
695ab213215SJeremy L Thompson // 3D collocated derivatives transpose
696ab213215SJeremy L Thompson //------------------------------------------------------------------------------
697ac421f39SYohann template <int NCOMP, int Q1d>
698ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
69918d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
700ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
701ac421f39SYohann       // X derivative
70218d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
703ac421f39SYohann       __syncthreads();
704ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
70518d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
706ac421f39SYohann       __syncthreads();
707ac421f39SYohann       // Y derivative
70818d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
709ac421f39SYohann       __syncthreads();
710ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
71118d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
712ac421f39SYohann       __syncthreads();
713ac421f39SYohann       // Z derivative
714ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
715ac421f39SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
716ac421f39SYohann     }
717ac421f39SYohann   }
71818d499f1SYohann }
719ac421f39SYohann 
720ab213215SJeremy L Thompson //------------------------------------------------------------------------------
721ab213215SJeremy L Thompson // 1D quadrature weights
722ab213215SJeremy L Thompson //------------------------------------------------------------------------------
723241a4b83SYohann template <int Q1d>
724053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
72518d499f1SYohann   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
726241a4b83SYohann }
727241a4b83SYohann 
728ab213215SJeremy L Thompson //------------------------------------------------------------------------------
729ab213215SJeremy L Thompson // 2D quadrature weights
730ab213215SJeremy L Thompson //------------------------------------------------------------------------------
731241a4b83SYohann template <int Q1d>
732053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
73318d499f1SYohann   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
73418d499f1SYohann         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
735241a4b83SYohann }
736241a4b83SYohann 
737ab213215SJeremy L Thompson //------------------------------------------------------------------------------
738ab213215SJeremy L Thompson // 3D quadrature weights
739ab213215SJeremy L Thompson //------------------------------------------------------------------------------
740241a4b83SYohann template <int Q1d>
741053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
74218d499f1SYohann   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
74318d499f1SYohann   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
744ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
74518d499f1SYohann     w[z] = quad ? pw*qweight1d[z] : 0.0;
746241a4b83SYohann }
747241a4b83SYohann 
748241a4b83SYohann );
749ab213215SJeremy L Thompson //------------------------------------------------------------------------------
750ab213215SJeremy L Thompson // Build singe operator kernel
751ab213215SJeremy L Thompson //------------------------------------------------------------------------------
752241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
753241a4b83SYohann 
754241a4b83SYohann   using std::ostringstream;
755241a4b83SYohann   using std::string;
756241a4b83SYohann   int ierr;
757241a4b83SYohann   bool setupdone;
758e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
759e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
760241a4b83SYohann   Ceed ceed;
761e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
762241a4b83SYohann   CeedOperator_Cuda_gen *data;
763e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
764241a4b83SYohann   CeedQFunction qf;
765241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
766e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
767e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
76888db6d3bSJeremy L Thompson   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
7695c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
770e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
771e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
772241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
7737e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
774e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
775241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
7767e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
777e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
778241a4b83SYohann   CeedEvalMode emode;
779241a4b83SYohann   CeedBasis basis;
780241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
781241a4b83SYohann   CeedElemRestriction Erestrict;
782461525f5SNatalie Beams   CeedElemRestriction_Cuda *restr_data;
783241a4b83SYohann 
7840b454692Sjeremylt   // Check for restriction only identity operator
7850b454692Sjeremylt   bool is_identity_qf;
7860b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7870b454692Sjeremylt   if (is_identity_qf) {
7880b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7890b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
7900b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
7910b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
7920b454692Sjeremylt       // LCOV_EXCL_START
7930b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
7940b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
7950b454692Sjeremylt     // LCOV_EXCL_STOP
7960b454692Sjeremylt   }
7970b454692Sjeremylt 
798241a4b83SYohann   ostringstream code;
799241a4b83SYohann   string devFunctions(deviceFunctions);
800241a4b83SYohann 
801f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
802f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
803f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
80488db6d3bSJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
80588db6d3bSJeremy L Thompson   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr);
80680a9ef05SNatalie Beams   if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
807f1a13f77SYohann Dudouit     code << atomicAdd;
808f1a13f77SYohann Dudouit   }
809f1a13f77SYohann Dudouit 
810241a4b83SYohann   code << devFunctions;
811241a4b83SYohann 
812241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
813c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
814c3c443faSYohann Dudouit   string oper;
81514cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
8164d537eeaSYohann 
8174d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81877841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
819e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
820e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8211da99368SJeremy L Thompson 
8221da99368SJeremy L Thompson   // Find dim and Q1d
82364d3f0c0Sjeremylt   bool useCollograd = true;
82418d499f1SYohann   data->maxP1d = 0;
8251da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
826e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8271da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
828e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8290f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
830e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8311da99368SJeremy L Thompson 
8321da99368SJeremy L Thompson       // Check for collocated gradient
83318d499f1SYohann       useCollograd = useCollograd && basis_data->d_collograd1d;
8341da99368SJeremy L Thompson 
8351da99368SJeremy L Thompson       // Collect dim and Q1d
836e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8371da99368SJeremy L Thompson       bool isTensor;
838e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8391da99368SJeremy L Thompson       if (isTensor) {
840e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
841e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
84218d499f1SYohann         if (P1d>data->maxP1d) data->maxP1d = P1d;
8431da99368SJeremy L Thompson       } else {
844e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
845e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
846e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8471da99368SJeremy L Thompson         }
8481da99368SJeremy L Thompson     }
8491da99368SJeremy L Thompson   }
8501da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8511da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8521da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
853e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8540f54b25eSjeremylt 
8551da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
856e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8570f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
858e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8590f54b25eSjeremylt 
8601da99368SJeremy L Thompson       // Collect dim and Q1d
861e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8621da99368SJeremy L Thompson       bool isTensor;
863e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8641da99368SJeremy L Thompson       if (isTensor) {
865e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8661da99368SJeremy L Thompson       } else {
867e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
868e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
869e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8701da99368SJeremy L Thompson         }
8710f54b25eSjeremylt 
8720f54b25eSjeremylt       // Check for collocated gradient
87364d3f0c0Sjeremylt       useCollograd = useCollograd && basis_data->d_collograd1d;
8741da99368SJeremy L Thompson     }
8751da99368SJeremy L Thompson   }
8761da99368SJeremy L Thompson   data->dim = dim;
8771da99368SJeremy L Thompson   data->Q1d = Q1d;
8781da99368SJeremy L Thompson 
8791da99368SJeremy L Thompson   // Define CEED_Q_VLA
88064d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
8811da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8821da99368SJeremy L Thompson   } else {
8831da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8841da99368SJeremy L Thompson   }
8851da99368SJeremy L Thompson 
886241a4b83SYohann   code << qFunction;
887241a4b83SYohann 
888241a4b83SYohann   // Setup
889818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
890c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
891241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
892241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
893e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8941da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
895241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
896241a4b83SYohann     }
897241a4b83SYohann   }
898241a4b83SYohann 
899241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
900241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
901241a4b83SYohann   }
9021da99368SJeremy L Thompson 
903241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
904241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
9051da99368SJeremy L Thompson 
906241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
907241a4b83SYohann   code << "  BackendData data;\n";
908241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
909241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
910241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
911241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
91218d499f1SYohann   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
913920dcdc4Sjeremylt 
914818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
915ac421f39SYohann   //Initialize constants, and matrices B and G
916241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
917818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
918241a4b83SYohann     // Get elemsize, emode, ncomp
919241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
920e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
921241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
922e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
923241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
924e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9254d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
926e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
927920dcdc4Sjeremylt 
928920dcdc4Sjeremylt     // Set field constants
929920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
930e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
931920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
932e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
933920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
934920dcdc4Sjeremylt       } else {
935920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
936920dcdc4Sjeremylt       }
937920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
938920dcdc4Sjeremylt     }
939920dcdc4Sjeremylt 
940920dcdc4Sjeremylt     // Load basis data
941920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
942241a4b83SYohann     switch (emode) {
943241a4b83SYohann     case CEED_EVAL_NONE:
944241a4b83SYohann       break;
945241a4b83SYohann     case CEED_EVAL_INTERP:
946e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
947241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
94880a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
950241a4b83SYohann       break;
951241a4b83SYohann     case CEED_EVAL_GRAD:
952e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
953241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
95480a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
955241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
95664d3f0c0Sjeremylt       if (useCollograd) {
957ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
95880a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
959ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
960ac421f39SYohann       } else {
961ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
96280a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
963241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
964ac421f39SYohann       }
965241a4b83SYohann       break;
966241a4b83SYohann     case CEED_EVAL_WEIGHT:
967241a4b83SYohann       break; // No action
968241a4b83SYohann     case CEED_EVAL_DIV:
969241a4b83SYohann       break; // TODO: Not implemented
970241a4b83SYohann     case CEED_EVAL_CURL:
971241a4b83SYohann       break; // TODO: Not implemented
972241a4b83SYohann     }
973241a4b83SYohann   }
974920dcdc4Sjeremylt 
975818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
976241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
977818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
978241a4b83SYohann     // Get elemsize, emode, ncomp
979241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
980e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
981241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
982e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
983241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
984e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9854d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
986e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
987920dcdc4Sjeremylt 
988920dcdc4Sjeremylt     // Set field constants
989e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
990920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
991e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
992241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
993920dcdc4Sjeremylt     } else {
994920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
995920dcdc4Sjeremylt     }
996920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
997920dcdc4Sjeremylt 
998920dcdc4Sjeremylt     // Load basis data
999920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1000920dcdc4Sjeremylt     switch (emode) {
1001920dcdc4Sjeremylt     case CEED_EVAL_NONE:
1002920dcdc4Sjeremylt       break; // No action
1003920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1004e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1005241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
100680a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1008241a4b83SYohann       break;
1009241a4b83SYohann     case CEED_EVAL_GRAD:
1010e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1011241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
101280a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1013241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
101464d3f0c0Sjeremylt       if (useCollograd) {
1015ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
101680a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1017ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1018ac421f39SYohann       } else {
1019ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
102080a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1021241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1022ac421f39SYohann       }
1023241a4b83SYohann       break;
1024e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1025241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1026241a4b83SYohann       Ceed ceed;
1027e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1028e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1029241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1030241a4b83SYohann       break; // Should not occur
1031241a4b83SYohann     }
1032241a4b83SYohann     case CEED_EVAL_DIV:
1033241a4b83SYohann       break; // TODO: Not implemented
1034241a4b83SYohann     case CEED_EVAL_CURL:
1035241a4b83SYohann       break; // TODO: Not implemented
1036e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1037241a4b83SYohann     }
1038241a4b83SYohann   }
1039818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
1040ac421f39SYohann   code << "  __syncthreads();\n";
1041241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1042241a4b83SYohann   // Input basis apply if needed
1043ac421f39SYohann   // Generate the correct eval mode code for each input
1044818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
1045241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1046818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1047241a4b83SYohann     // Get elemsize, emode, ncomp
1048241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1049e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1050241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1051e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1052241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1053e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10544d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1055e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1056920dcdc4Sjeremylt 
1057920dcdc4Sjeremylt     // Restriction
1058920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
105964d3f0c0Sjeremylt         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1060241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10619a2291e3SJeremy L Thompson 
10629a2291e3SJeremy L Thompson       bool isStrided;
1063e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10649a2291e3SJeremy L Thompson       if (!isStrided) {
10655c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1066e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10675c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10685c7b696cSjeremylt         CeedInt compstride;
1069e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10705c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
1071e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10729a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10735c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1074ccedf6b0Sjeremylt       } else {
1075920dcdc4Sjeremylt         bool backendstrides;
1076fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1077e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
107813585805SJeremy L Thompson         CeedInt nelem;
107913585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1080e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
108113585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1082920dcdc4Sjeremylt         if (!backendstrides) {
1083920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1084e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
1085ccedf6b0Sjeremylt         }
1086920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1087d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1088920dcdc4Sjeremylt       }
1089920dcdc4Sjeremylt     }
1090920dcdc4Sjeremylt 
1091920dcdc4Sjeremylt     // Basis action
1092920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1093920dcdc4Sjeremylt     switch (emode) {
1094920dcdc4Sjeremylt     case CEED_EVAL_NONE:
109564d3f0c0Sjeremylt       if (!useCollograd) {
1096920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1097920dcdc4Sjeremylt       }
1098920dcdc4Sjeremylt       break;
1099920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1100241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1101241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1102241a4b83SYohann       break;
1103241a4b83SYohann     case CEED_EVAL_GRAD:
110464d3f0c0Sjeremylt       if (useCollograd) {
1105ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1106ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1107ac421f39SYohann       } else {
1108241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1109241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1110ac421f39SYohann       }
1111241a4b83SYohann       break;
1112241a4b83SYohann     case CEED_EVAL_WEIGHT:
1113241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1114e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1115e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1116241a4b83SYohann       data->W = basis_data->d_qweight1d;
1117241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1118241a4b83SYohann       break; // No action
1119241a4b83SYohann     case CEED_EVAL_DIV:
1120241a4b83SYohann       break; // TODO: Not implemented
1121241a4b83SYohann     case CEED_EVAL_CURL:
1122241a4b83SYohann       break; // TODO: Not implemented
1123241a4b83SYohann     }
1124241a4b83SYohann   }
1125ac421f39SYohann 
1126241a4b83SYohann   // Q function
1127818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1128241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1129818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1130241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1131e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1132241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1133241a4b83SYohann     {
113464d3f0c0Sjeremylt       if (useCollograd) {
1135ac421f39SYohann         //Accumulator for gradient slices
1136ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1137ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1138ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1139ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1140ac421f39SYohann         code << "      }\n";
1141ac421f39SYohann         code << "    }\n";
1142ac421f39SYohann       } else {
1143241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1144241a4b83SYohann       }
1145ac421f39SYohann     }
1146241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1147241a4b83SYohann     {
1148241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1149241a4b83SYohann     }
1150241a4b83SYohann   }
1151ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
115264d3f0c0Sjeremylt   if (useCollograd) {
1153920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1154ac421f39SYohann     code << "#pragma unroll\n";
1155ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1156818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1157ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1158818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1159ac421f39SYohann       // Get elemsize, emode, ncomp
1160ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1161e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1162ac421f39SYohann       // Basis action
1163920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1164ac421f39SYohann       switch (emode) {
1165ac421f39SYohann       case CEED_EVAL_NONE:
1166ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11679a2291e3SJeremy L Thompson 
11689a2291e3SJeremy L Thompson         bool isStrided;
1169e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1170e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1171e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11729a2291e3SJeremy L Thompson         if (!isStrided) {
11735c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1174e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11755c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11765c7b696cSjeremylt           CeedInt compstride;
1177e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11785c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
1179e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11809a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11815c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1182920dcdc4Sjeremylt         } else {
1183920dcdc4Sjeremylt           bool backendstrides;
1184fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1185e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
118613585805SJeremy L Thompson           CeedInt nelem;
118713585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1188e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
118913585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1190920dcdc4Sjeremylt           if (!backendstrides) {
1191920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1192e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
1193920dcdc4Sjeremylt           }
1194920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1195d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1196920dcdc4Sjeremylt         }
1197ac421f39SYohann         break;
1198ac421f39SYohann       case CEED_EVAL_INTERP:
1199ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1200ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1201ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1202ac421f39SYohann         code << "      }\n";
1203ac421f39SYohann         break;
1204ac421f39SYohann       case CEED_EVAL_GRAD:
1205ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1206ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1207ac421f39SYohann         break;
1208ac421f39SYohann       case CEED_EVAL_WEIGHT:
1209ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1210ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1211ac421f39SYohann         break; // No action
1212ac421f39SYohann       case CEED_EVAL_DIV:
1213ac421f39SYohann         break; // TODO: Not implemented
1214ac421f39SYohann       case CEED_EVAL_CURL:
1215ac421f39SYohann         break; // TODO: Not implemented
1216ac421f39SYohann       }
1217ac421f39SYohann     }
1218818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1219ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1220818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1221ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1222e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1223ac421f39SYohann       // Basis action
1224ac421f39SYohann       switch (emode) {
1225ac421f39SYohann       case CEED_EVAL_NONE:
1226ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1227ac421f39SYohann         break; // No action
1228ac421f39SYohann       case CEED_EVAL_INTERP:
1229ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1230ac421f39SYohann         break;
1231ac421f39SYohann       case CEED_EVAL_GRAD:
1232ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1233ac421f39SYohann         break;
1234ac421f39SYohann       case CEED_EVAL_WEIGHT:
1235ac421f39SYohann         break; // Should not occur
1236ac421f39SYohann       case CEED_EVAL_DIV:
1237ac421f39SYohann         break; // TODO: Not implemented
1238ac421f39SYohann       case CEED_EVAL_CURL:
1239ac421f39SYohann         break; // TODO: Not implemented
1240ac421f39SYohann       }
1241ac421f39SYohann     }
1242ac421f39SYohann   } else {
1243920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1244818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1245ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1246818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1247ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1248ac421f39SYohann     }
1249818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1250ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1251818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1252ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1253ac421f39SYohann     }
1254ac421f39SYohann   }
1255818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12564d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12574d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1258818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1259ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12604d537eeaSYohann   }
12614d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12624d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1263818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1264ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12654d537eeaSYohann   }
1266818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1267ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
126864d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
1269ac421f39SYohann     code << "1";
1270ac421f39SYohann   } else {
1271ac421f39SYohann     code << "Q1d";
1272ac421f39SYohann   }
1273ac421f39SYohann   code << ", in, out);\n";
127464d3f0c0Sjeremylt   if (useCollograd) {
1275920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1276818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1277ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1278818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1279ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1280e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1281ac421f39SYohann       // Basis action
1282920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1283ac421f39SYohann       switch (emode) {
1284ac421f39SYohann       case CEED_EVAL_NONE:
1285ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1286ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1287ac421f39SYohann         code << "      }\n";
1288ac421f39SYohann         break; // No action
1289ac421f39SYohann       case CEED_EVAL_INTERP:
1290ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1291ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1292ac421f39SYohann         code << "      }\n";
1293ac421f39SYohann         break;
1294ac421f39SYohann       case CEED_EVAL_GRAD:
1295ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1296ac421f39SYohann         break;
1297ac421f39SYohann       case CEED_EVAL_WEIGHT:
1298ac421f39SYohann         break; // Should not occur
1299ac421f39SYohann       case CEED_EVAL_DIV:
1300ac421f39SYohann         break; // TODO: Not implemented
1301ac421f39SYohann       case CEED_EVAL_CURL:
1302ac421f39SYohann         break; // TODO: Not implemented
1303ac421f39SYohann       }
1304ac421f39SYohann     }
1305ac421f39SYohann     code << "    }\n";
1306ac421f39SYohann   }
1307241a4b83SYohann 
1308241a4b83SYohann   // Output basis apply if needed
1309ac421f39SYohann   // Generate the correct eval mode code for each output
1310818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1311241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1312818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1313241a4b83SYohann     // Get elemsize, emode, ncomp
1314241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1315e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1316241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1317e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1318241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1319e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13204d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1321e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1322241a4b83SYohann     // Basis action
1323920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1324241a4b83SYohann     switch (emode) {
1325241a4b83SYohann     case CEED_EVAL_NONE:
1326920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1327241a4b83SYohann       break; // No action
1328241a4b83SYohann     case CEED_EVAL_INTERP:
1329241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1330241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1331241a4b83SYohann       break;
1332241a4b83SYohann     case CEED_EVAL_GRAD:
1333241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
133464d3f0c0Sjeremylt       if (useCollograd) {
1335ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1336ac421f39SYohann       } else {
1337241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1338ac421f39SYohann       }
1339241a4b83SYohann       break;
1340e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1341241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1342241a4b83SYohann       Ceed ceed;
1343e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1344e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1345241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1346241a4b83SYohann       break; // Should not occur
1347241a4b83SYohann     }
1348241a4b83SYohann     case CEED_EVAL_DIV:
1349241a4b83SYohann       break; // TODO: Not implemented
1350241a4b83SYohann     case CEED_EVAL_CURL:
1351241a4b83SYohann       break; // TODO: Not implemented
1352e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1353241a4b83SYohann     }
1354920dcdc4Sjeremylt     // Restriction
13559a2291e3SJeremy L Thompson       bool isStrided;
1356e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13579a2291e3SJeremy L Thompson     if (!isStrided) {
13585c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1359e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13605c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13615c7b696cSjeremylt       CeedInt compstride;
1362e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13635c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
1364e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13659a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13665c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1367920dcdc4Sjeremylt     } else {
1368920dcdc4Sjeremylt       bool backendstrides;
1369fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1370e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
137113585805SJeremy L Thompson       CeedInt nelem;
137213585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1373e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
137413585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1375920dcdc4Sjeremylt       if (!backendstrides) {
1376920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1377e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
1378920dcdc4Sjeremylt       }
1379920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1380d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1381920dcdc4Sjeremylt     }
1382241a4b83SYohann   }
1383241a4b83SYohann 
1384241a4b83SYohann   code << "  }\n";
1385818e0025SJeremy L Thompson   code << "}\n";
1386818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1387241a4b83SYohann 
1388ab213215SJeremy L Thompson   // View kernel for debugging
13893f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
1390241a4b83SYohann 
139118d499f1SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
139218d499f1SYohann                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1393e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1394c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1395e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1396241a4b83SYohann 
1397e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1398e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1399241a4b83SYohann }
1400ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1401