xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 18d499f16311f0272db1a57764883905102e8c42)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
177df94212SJeremy L Thompson 
18241a4b83SYohann #include "ceed-cuda-gen.h"
19241a4b83SYohann #include <iostream>
20241a4b83SYohann #include <sstream>
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25ab213215SJeremy L Thompson //------------------------------------------------------------------------------
26ab213215SJeremy L Thompson // Atomic add, for older CUDA
27ab213215SJeremy L Thompson //------------------------------------------------------------------------------
28241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
29241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
30241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
31241a4b83SYohann   do {
32241a4b83SYohann     assumed = old;
33241a4b83SYohann     old =
34241a4b83SYohann       atomicCAS(address_as_ull, assumed,
35241a4b83SYohann                 __double_as_longlong(val +
36241a4b83SYohann                                      __longlong_as_double(assumed)));
37241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
38241a4b83SYohann     // (since NaN != NaN)
39241a4b83SYohann   } while (assumed != old);
40241a4b83SYohann   return __longlong_as_double(old);
41241a4b83SYohann }
42f1a13f77SYohann Dudouit );
43f1a13f77SYohann Dudouit 
44f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
45f1a13f77SYohann Dudouit 
46ab213215SJeremy L Thompson //------------------------------------------------------------------------------
47ab213215SJeremy L Thompson // Typedefs
48ab213215SJeremy L Thompson //------------------------------------------------------------------------------
49f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
50f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
51f1a13f77SYohann Dudouit 
52f1a13f77SYohann Dudouit typedef struct {
53f1a13f77SYohann Dudouit   CeedInt tidx;
54f1a13f77SYohann Dudouit   CeedInt tidy;
55f1a13f77SYohann Dudouit   CeedInt tidz;
56f1a13f77SYohann Dudouit   CeedInt tid;
57f1a13f77SYohann Dudouit   CeedScalar* slice;
58f1a13f77SYohann Dudouit } BackendData;
59241a4b83SYohann 
60ab213215SJeremy L Thompson //------------------------------------------------------------------------------
61ab213215SJeremy L Thompson // Load matrices for basis actions
62ab213215SJeremy L Thompson //------------------------------------------------------------------------------
63241a4b83SYohann template <int P, int Q>
64241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
65ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
66241a4b83SYohann     B[i] = d_B[i];
67241a4b83SYohann }
68241a4b83SYohann 
69ab213215SJeremy L Thompson //------------------------------------------------------------------------------
70241a4b83SYohann // 1D
71ab213215SJeremy L Thompson //------------------------------------------------------------------------------
72ab213215SJeremy L Thompson 
73ab213215SJeremy L Thompson //------------------------------------------------------------------------------
74ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
75ab213215SJeremy L Thompson //------------------------------------------------------------------------------
765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
775c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78ab213215SJeremy L Thompson   if (data.tidx < P1d) {
794d537eeaSYohann     const CeedInt node = data.tidx;
80ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
81ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
825c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
83241a4b83SYohann   }
84241a4b83SYohann }
85920dcdc4Sjeremylt 
86ab213215SJeremy L Thompson //------------------------------------------------------------------------------
87ab213215SJeremy L Thompson // L-vector -> E-vector, strided
88ab213215SJeremy L Thompson //------------------------------------------------------------------------------
89d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
90d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
91ab213215SJeremy L Thompson   if (data.tidx < P1d) {
92ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
93d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
94ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
95d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
96ccedf6b0Sjeremylt   }
97ccedf6b0Sjeremylt }
98241a4b83SYohann 
99ab213215SJeremy L Thompson //------------------------------------------------------------------------------
100ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
101ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1025c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1035c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
104ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1054d537eeaSYohann     const CeedInt node = data.tidx;
106ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
107ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1085c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
109241a4b83SYohann   }
110241a4b83SYohann }
111241a4b83SYohann 
112ab213215SJeremy L Thompson //------------------------------------------------------------------------------
113ab213215SJeremy L Thompson // E-vector -> L-vector, strided
114ab213215SJeremy L Thompson //------------------------------------------------------------------------------
115d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
116d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
117ab213215SJeremy L Thompson   if (data.tidx < P1d) {
118ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
119d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
120ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
121d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
122ccedf6b0Sjeremylt   }
123ccedf6b0Sjeremylt }
124ccedf6b0Sjeremylt 
125ab213215SJeremy L Thompson //------------------------------------------------------------------------------
126ab213215SJeremy L Thompson // 1D tensor contraction x
127ab213215SJeremy L Thompson //------------------------------------------------------------------------------
128241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
129ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130241a4b83SYohann   data.slice[data.tidx] = *U;
131241a4b83SYohann   __syncthreads();
132241a4b83SYohann   *V = 0.0;
133*18d499f1SYohann   if (data.tidx < Q1d)
134ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
135ab213215SJeremy L Thompson       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
136241a4b83SYohann   __syncthreads();
137241a4b83SYohann }
138241a4b83SYohann 
139ab213215SJeremy L Thompson //------------------------------------------------------------------------------
140ab213215SJeremy L Thompson // 1D transpose tensor contraction x
141ab213215SJeremy L Thompson //------------------------------------------------------------------------------
142241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
143ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
144241a4b83SYohann   data.slice[data.tidx] = *U;
145241a4b83SYohann   __syncthreads();
146241a4b83SYohann   *V = 0.0;
147*18d499f1SYohann   if (data.tidx < P1d)
148ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
149ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
150241a4b83SYohann   __syncthreads();
151241a4b83SYohann }
152241a4b83SYohann 
153ab213215SJeremy L Thompson //------------------------------------------------------------------------------
154ab213215SJeremy L Thompson // 1D interpolate to quadrature points
155ab213215SJeremy L Thompson //------------------------------------------------------------------------------
156241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
157ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
158ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
159241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
160241a4b83SYohann }
161241a4b83SYohann 
162ab213215SJeremy L Thompson //------------------------------------------------------------------------------
163ab213215SJeremy L Thompson // 1D interpolate transpose
164ab213215SJeremy L Thompson //------------------------------------------------------------------------------
165241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
166ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
167ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
168241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
169241a4b83SYohann }
170241a4b83SYohann 
171ab213215SJeremy L Thompson //------------------------------------------------------------------------------
172ab213215SJeremy L Thompson // 1D derivatives at quadrature points
173ab213215SJeremy L Thompson //------------------------------------------------------------------------------
174241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
175ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
176ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
177241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
178241a4b83SYohann }
179241a4b83SYohann 
180ab213215SJeremy L Thompson //------------------------------------------------------------------------------
181ab213215SJeremy L Thompson // 1D derivatives transpose
182ab213215SJeremy L Thompson //------------------------------------------------------------------------------
183241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
184ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
185ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
186241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
187241a4b83SYohann }
188241a4b83SYohann 
189ab213215SJeremy L Thompson //------------------------------------------------------------------------------
190241a4b83SYohann // 2D
191ab213215SJeremy L Thompson //------------------------------------------------------------------------------
192ab213215SJeremy L Thompson 
193ab213215SJeremy L Thompson //------------------------------------------------------------------------------
194ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
195ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1965c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1975c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
198ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
1994d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
200ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
201ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2025c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
203241a4b83SYohann   }
204241a4b83SYohann }
205920dcdc4Sjeremylt 
206ab213215SJeremy L Thompson //------------------------------------------------------------------------------
207ab213215SJeremy L Thompson // L-vector -> E-vector, strided
208ab213215SJeremy L Thompson //------------------------------------------------------------------------------
209d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
210d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
211ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
212ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
213d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
214ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
215d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
216ccedf6b0Sjeremylt   }
217ccedf6b0Sjeremylt }
218241a4b83SYohann 
219ab213215SJeremy L Thompson //------------------------------------------------------------------------------
220ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
221ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2225c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
2235c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
224ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2254d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
226ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
227ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2285c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
229241a4b83SYohann   }
230241a4b83SYohann }
231241a4b83SYohann 
232ab213215SJeremy L Thompson //------------------------------------------------------------------------------
233ab213215SJeremy L Thompson // E-vector -> L-vector, strided
234ab213215SJeremy L Thompson //------------------------------------------------------------------------------
235d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
236d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
237ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
238ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
239d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
240ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
241d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
242ccedf6b0Sjeremylt   }
243ccedf6b0Sjeremylt }
244ccedf6b0Sjeremylt 
245ab213215SJeremy L Thompson //------------------------------------------------------------------------------
246ab213215SJeremy L Thompson // 2D tensor contraction x
247ab213215SJeremy L Thompson //------------------------------------------------------------------------------
248241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
249ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
250*18d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
251241a4b83SYohann   __syncthreads();
252241a4b83SYohann   *V = 0.0;
253*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
254ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
255*18d499f1SYohann       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
256241a4b83SYohann   __syncthreads();
257241a4b83SYohann }
258241a4b83SYohann 
259ab213215SJeremy L Thompson //------------------------------------------------------------------------------
260ab213215SJeremy L Thompson // 2D tensor contract y
261ab213215SJeremy L Thompson //------------------------------------------------------------------------------
262241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
263ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
264*18d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
265241a4b83SYohann   __syncthreads();
266241a4b83SYohann   *V = 0.0;
267*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d)
268ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
269*18d499f1SYohann       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
270241a4b83SYohann   __syncthreads();
271241a4b83SYohann }
272241a4b83SYohann 
273ab213215SJeremy L Thompson //------------------------------------------------------------------------------
274ab213215SJeremy L Thompson // 2D transpose tensor contract y
275ab213215SJeremy L Thompson //------------------------------------------------------------------------------
276241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
277ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
278*18d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
279241a4b83SYohann   __syncthreads();
280241a4b83SYohann   *V = 0.0;
281*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
282ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
283*18d499f1SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
284241a4b83SYohann   __syncthreads();
285241a4b83SYohann }
286241a4b83SYohann 
287ab213215SJeremy L Thompson //------------------------------------------------------------------------------
288ab213215SJeremy L Thompson // 2D transpose tensor contract x
289ab213215SJeremy L Thompson //------------------------------------------------------------------------------
290241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
291ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
292*18d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
293241a4b83SYohann   __syncthreads();
294241a4b83SYohann   *V = 0.0;
295*18d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
296ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
297*18d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
298241a4b83SYohann   __syncthreads();
299241a4b83SYohann }
300241a4b83SYohann 
301ab213215SJeremy L Thompson //------------------------------------------------------------------------------
302ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
303ab213215SJeremy L Thompson //------------------------------------------------------------------------------
304241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
305ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
306*18d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
307241a4b83SYohann   __syncthreads();
308*18d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
309ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
310*18d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
311241a4b83SYohann   __syncthreads();
312241a4b83SYohann }
313241a4b83SYohann 
314ab213215SJeremy L Thompson //------------------------------------------------------------------------------
315ab213215SJeremy L Thompson // 2D interpolate to quadrature points
316ab213215SJeremy L Thompson //------------------------------------------------------------------------------
317241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
318ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
319241a4b83SYohann   CeedScalar r_t[1];
320ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
321241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
322241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
323241a4b83SYohann   }
324241a4b83SYohann }
325241a4b83SYohann 
326ab213215SJeremy L Thompson //------------------------------------------------------------------------------
327ab213215SJeremy L Thompson // 2D interpolate transpose
328ab213215SJeremy L Thompson //------------------------------------------------------------------------------
329241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
330ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
331241a4b83SYohann   CeedScalar r_t[1];
332ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
333241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
334241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
335241a4b83SYohann   }
336241a4b83SYohann }
337241a4b83SYohann 
338ab213215SJeremy L Thompson //------------------------------------------------------------------------------
339ab213215SJeremy L Thompson // 2D derivatives at quadrature points
340ab213215SJeremy L Thompson //------------------------------------------------------------------------------
341241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
342ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
343241a4b83SYohann   CeedScalar r_t[1];
344ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
345241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
346241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
347241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
348241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
349241a4b83SYohann   }
350241a4b83SYohann }
351241a4b83SYohann 
352ab213215SJeremy L Thompson //------------------------------------------------------------------------------
353ab213215SJeremy L Thompson // 2D derivatives transpose
354ab213215SJeremy L Thompson //------------------------------------------------------------------------------
355241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
356ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
357241a4b83SYohann   CeedScalar r_t[1];
358ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
359241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
360241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
361241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
362241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
363241a4b83SYohann   }
364241a4b83SYohann }
365241a4b83SYohann 
366ab213215SJeremy L Thompson //------------------------------------------------------------------------------
367241a4b83SYohann // 3D
368ab213215SJeremy L Thompson //------------------------------------------------------------------------------
369ab213215SJeremy L Thompson 
370ab213215SJeremy L Thompson //------------------------------------------------------------------------------
371ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
372ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3735c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
3745c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
375ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
376241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3774d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
378ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
379ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3805c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
381241a4b83SYohann     }
382241a4b83SYohann }
383920dcdc4Sjeremylt 
384ab213215SJeremy L Thompson //------------------------------------------------------------------------------
385ab213215SJeremy L Thompson // L-vector -> E-vector, strided
386ab213215SJeremy L Thompson //------------------------------------------------------------------------------
387d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
388d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
389ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
390ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
391ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
392d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
393ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
394d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
395ccedf6b0Sjeremylt     }
396ccedf6b0Sjeremylt }
397241a4b83SYohann 
398ab213215SJeremy L Thompson //------------------------------------------------------------------------------
399ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
400ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
4025c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
403*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
404ac421f39SYohann     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
405920dcdc4Sjeremylt     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
406ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4075c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
408ac421f39SYohann   }
409*18d499f1SYohann }
410ac421f39SYohann 
411ab213215SJeremy L Thompson //------------------------------------------------------------------------------
412ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
413ab213215SJeremy L Thompson //------------------------------------------------------------------------------
414d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
41525dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
416*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
417920dcdc4Sjeremylt     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
418d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
419ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
420d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
421920dcdc4Sjeremylt   }
422*18d499f1SYohann }
423920dcdc4Sjeremylt 
424ab213215SJeremy L Thompson //------------------------------------------------------------------------------
425ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
426ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4275c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
4285c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
429ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
430241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4314d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
432ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
433ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4345c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
435241a4b83SYohann     }
436241a4b83SYohann }
437241a4b83SYohann 
438ab213215SJeremy L Thompson //------------------------------------------------------------------------------
439ab213215SJeremy L Thompson // E-vector -> L-vector, strided
440ab213215SJeremy L Thompson //------------------------------------------------------------------------------
441d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4422f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
443ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
444ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
445ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
446d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
447ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
448d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
449ccedf6b0Sjeremylt     }
450ccedf6b0Sjeremylt }
451ccedf6b0Sjeremylt 
452ab213215SJeremy L Thompson //------------------------------------------------------------------------------
453ab213215SJeremy L Thompson // 3D tensor contract x
454ab213215SJeremy L Thompson //------------------------------------------------------------------------------
455241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
456ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
457ac421f39SYohann   CeedScalar r_B[P1d];
458ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
459ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
460ab213215SJeremy L Thompson 
461ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
462*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
463241a4b83SYohann     __syncthreads();
464241a4b83SYohann     V[k] = 0.0;
465*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
466ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
467*18d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
468241a4b83SYohann     __syncthreads();
469241a4b83SYohann   }
470241a4b83SYohann }
471241a4b83SYohann 
472ab213215SJeremy L Thompson //------------------------------------------------------------------------------
473ab213215SJeremy L Thompson // 3D tensor contract y
474ab213215SJeremy L Thompson //------------------------------------------------------------------------------
475241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
476ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
477ac421f39SYohann   CeedScalar r_B[P1d];
478ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
479ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
480ab213215SJeremy L Thompson 
481ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
482*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
483241a4b83SYohann     __syncthreads();
484241a4b83SYohann     V[k] = 0.0;
485*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
486ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
487*18d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
488241a4b83SYohann     __syncthreads();
489241a4b83SYohann   }
490241a4b83SYohann }
491241a4b83SYohann 
492ab213215SJeremy L Thompson //------------------------------------------------------------------------------
493ab213215SJeremy L Thompson // 3D tensor contract z
494ab213215SJeremy L Thompson //------------------------------------------------------------------------------
495241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
496ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
497ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
498241a4b83SYohann     V[k] = 0.0;
499*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
500ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
501ab213215SJeremy L Thompson         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
502241a4b83SYohann   }
503241a4b83SYohann }
504241a4b83SYohann 
505ab213215SJeremy L Thompson //------------------------------------------------------------------------------
506ab213215SJeremy L Thompson // 3D transpose tensor contract z
507ab213215SJeremy L Thompson //------------------------------------------------------------------------------
508241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
509ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
510*18d499f1SYohann   for (CeedInt k = 0; k < P1d; ++k) {
511241a4b83SYohann     V[k] = 0.0;
512*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
513ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
514ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
515241a4b83SYohann   }
516241a4b83SYohann }
517241a4b83SYohann 
518ab213215SJeremy L Thompson //------------------------------------------------------------------------------
519ab213215SJeremy L Thompson // 3D transpose tensor contract y
520ab213215SJeremy L Thompson //------------------------------------------------------------------------------
521241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
522ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
523ac421f39SYohann   CeedScalar r_B[Q1d];
524ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
525ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
526ab213215SJeremy L Thompson 
527ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
528*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
529241a4b83SYohann     __syncthreads();
530241a4b83SYohann     V[k] = 0.0;
531*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
532ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
533*18d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
534ac421f39SYohann     __syncthreads();
535ac421f39SYohann   }
536ac421f39SYohann }
537ac421f39SYohann 
538ab213215SJeremy L Thompson //------------------------------------------------------------------------------
539ab213215SJeremy L Thompson // 3D transpose tensor contract add y
540ab213215SJeremy L Thompson //------------------------------------------------------------------------------
541ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
542ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
543ac421f39SYohann   CeedScalar r_B[Q1d];
544ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
545ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
546ab213215SJeremy L Thompson 
547ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
548*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
549ac421f39SYohann     __syncthreads();
550*18d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
551ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
552*18d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
553241a4b83SYohann     __syncthreads();
554241a4b83SYohann   }
555241a4b83SYohann }
556241a4b83SYohann 
557ab213215SJeremy L Thompson //------------------------------------------------------------------------------
558ab213215SJeremy L Thompson // 3D transpose tensor contract x
559ab213215SJeremy L Thompson //------------------------------------------------------------------------------
560241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
561ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
562ac421f39SYohann   CeedScalar r_B[Q1d];
563ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
564ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
565ab213215SJeremy L Thompson 
566ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
567*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
568241a4b83SYohann     __syncthreads();
569241a4b83SYohann     V[k] = 0.0;
570*18d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
571ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
572*18d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
573241a4b83SYohann     __syncthreads();
574241a4b83SYohann   }
575241a4b83SYohann }
576241a4b83SYohann 
577ab213215SJeremy L Thompson //------------------------------------------------------------------------------
578ab213215SJeremy L Thompson // 3D transpose tensor contract add x
579ab213215SJeremy L Thompson //------------------------------------------------------------------------------
580241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
581ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
582ac421f39SYohann   CeedScalar r_B[Q1d];
583ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
584ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
585ab213215SJeremy L Thompson 
586ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
587*18d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
588241a4b83SYohann     __syncthreads();
589*18d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
590ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
591*18d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
592241a4b83SYohann     __syncthreads();
593241a4b83SYohann   }
594241a4b83SYohann }
595241a4b83SYohann 
596ab213215SJeremy L Thompson //------------------------------------------------------------------------------
597ab213215SJeremy L Thompson // 3D interpolate to quadrature points
598ab213215SJeremy L Thompson //------------------------------------------------------------------------------
599241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
600ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
601*18d499f1SYohann   CeedScalar r_t1[T1d];
602*18d499f1SYohann   CeedScalar r_t2[T1d];
603ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
604241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
605241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
606241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
607241a4b83SYohann   }
608241a4b83SYohann }
609241a4b83SYohann 
610ab213215SJeremy L Thompson //------------------------------------------------------------------------------
611ab213215SJeremy L Thompson // 3D interpolate transpose
612ab213215SJeremy L Thompson //------------------------------------------------------------------------------
613241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
614ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
615*18d499f1SYohann   CeedScalar r_t1[T1d];
616*18d499f1SYohann   CeedScalar r_t2[T1d];
617ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
618241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
619241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
620241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
621241a4b83SYohann   }
622241a4b83SYohann }
623241a4b83SYohann 
624ab213215SJeremy L Thompson //------------------------------------------------------------------------------
625ab213215SJeremy L Thompson // 3D derivatives at quadrature points
626ab213215SJeremy L Thompson //------------------------------------------------------------------------------
627241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
628ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
629*18d499f1SYohann   CeedScalar r_t1[T1d];
630*18d499f1SYohann   CeedScalar r_t2[T1d];
631ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
632241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
633241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
634241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
635241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
636241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
637241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
638241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
639241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
640241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
641241a4b83SYohann   }
642241a4b83SYohann }
643241a4b83SYohann 
644ab213215SJeremy L Thompson //------------------------------------------------------------------------------
645ab213215SJeremy L Thompson // 3D derivatives transpose
646ab213215SJeremy L Thompson //------------------------------------------------------------------------------
647241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
648ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
649*18d499f1SYohann   CeedScalar r_t1[T1d];
650*18d499f1SYohann   CeedScalar r_t2[T1d];
651ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
652241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
653241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
654241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
655241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
656241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
657241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
658241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
659241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
660241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
661241a4b83SYohann   }
662241a4b83SYohann }
663241a4b83SYohann 
664ab213215SJeremy L Thompson //------------------------------------------------------------------------------
665ab213215SJeremy L Thompson // 3D collocated derivatives computation
666ab213215SJeremy L Thompson //------------------------------------------------------------------------------
667ac421f39SYohann template <int NCOMP, int Q1d>
668ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
669*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
670ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
671*18d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
672ac421f39SYohann       __syncthreads();
673ac421f39SYohann       // X derivative
674ac421f39SYohann       r_V[comp+0*NCOMP] = 0.0;
675ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
676*18d499f1SYohann         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
677ac421f39SYohann       // Y derivative
678ac421f39SYohann       r_V[comp+1*NCOMP] = 0.0;
679ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
680*18d499f1SYohann         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
681ac421f39SYohann       // Z derivative
682ac421f39SYohann       r_V[comp+2*NCOMP] = 0.0;
683ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
684ab213215SJeremy L Thompson         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
685ac421f39SYohann       __syncthreads();
686ac421f39SYohann     }
687ac421f39SYohann   }
688*18d499f1SYohann }
689ac421f39SYohann 
690ab213215SJeremy L Thompson //------------------------------------------------------------------------------
691ab213215SJeremy L Thompson // 3D collocated derivatives transpose
692ab213215SJeremy L Thompson //------------------------------------------------------------------------------
693ac421f39SYohann template <int NCOMP, int Q1d>
694ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
695*18d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
696ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
697ac421f39SYohann       // X derivative
698*18d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
699ac421f39SYohann       __syncthreads();
700ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
701*18d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
702ac421f39SYohann       __syncthreads();
703ac421f39SYohann       // Y derivative
704*18d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
705ac421f39SYohann       __syncthreads();
706ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
707*18d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
708ac421f39SYohann       __syncthreads();
709ac421f39SYohann       // Z derivative
710ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
711ac421f39SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
712ac421f39SYohann     }
713ac421f39SYohann   }
714*18d499f1SYohann }
715ac421f39SYohann 
716ab213215SJeremy L Thompson //------------------------------------------------------------------------------
717ab213215SJeremy L Thompson // 1D quadrature weights
718ab213215SJeremy L Thompson //------------------------------------------------------------------------------
719241a4b83SYohann template <int Q1d>
720241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
721*18d499f1SYohann   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
722241a4b83SYohann }
723241a4b83SYohann 
724ab213215SJeremy L Thompson //------------------------------------------------------------------------------
725ab213215SJeremy L Thompson // 2D quadrature weights
726ab213215SJeremy L Thompson //------------------------------------------------------------------------------
727241a4b83SYohann template <int Q1d>
728241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
729*18d499f1SYohann   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
730*18d499f1SYohann         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
731241a4b83SYohann }
732241a4b83SYohann 
733ab213215SJeremy L Thompson //------------------------------------------------------------------------------
734ab213215SJeremy L Thompson // 3D quadrature weights
735ab213215SJeremy L Thompson //------------------------------------------------------------------------------
736241a4b83SYohann template <int Q1d>
737241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
738*18d499f1SYohann   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
739*18d499f1SYohann   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
740ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
741*18d499f1SYohann     w[z] = quad ? pw*qweight1d[z] : 0.0;
742241a4b83SYohann }
743241a4b83SYohann 
744241a4b83SYohann );
745ab213215SJeremy L Thompson //------------------------------------------------------------------------------
746ab213215SJeremy L Thompson // Build singe operator kernel
747ab213215SJeremy L Thompson //------------------------------------------------------------------------------
748241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
749241a4b83SYohann 
750241a4b83SYohann   using std::ostringstream;
751241a4b83SYohann   using std::string;
752241a4b83SYohann   int ierr;
753241a4b83SYohann   bool setupdone;
754fd364f38SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
755241a4b83SYohann   if (setupdone) return 0;
756241a4b83SYohann   Ceed ceed;
757241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
758241a4b83SYohann   CeedOperator_Cuda_gen *data;
7596c845298Sjeremylt   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
760241a4b83SYohann   CeedQFunction qf;
761241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
762241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
7636c845298Sjeremylt   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
7641da99368SJeremy L Thompson   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
7655c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
766241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
767241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
768241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
769241a4b83SYohann   CeedChk(ierr);
770241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
771241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
772241a4b83SYohann   CeedChk(ierr);
773241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
774241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
775241a4b83SYohann   CeedChk(ierr);
776241a4b83SYohann   CeedEvalMode emode;
777241a4b83SYohann   CeedBasis basis;
778241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
779241a4b83SYohann   CeedElemRestriction Erestrict;
780241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
781241a4b83SYohann 
782241a4b83SYohann   ostringstream code;
783241a4b83SYohann   string devFunctions(deviceFunctions);
784241a4b83SYohann 
785f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
786f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
787f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
7886c845298Sjeremylt   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
789f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
790f1a13f77SYohann Dudouit   if (prop.major<6){
791f1a13f77SYohann Dudouit     code << atomicAdd;
792f1a13f77SYohann Dudouit   }
793f1a13f77SYohann Dudouit 
794241a4b83SYohann   code << devFunctions;
795241a4b83SYohann 
796241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
797c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
798c3c443faSYohann Dudouit   string oper;
79914cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
8004d537eeaSYohann 
8014d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
802ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
8031da99368SJeremy L Thompson 
8041da99368SJeremy L Thompson   // Find dim and Q1d
80564d3f0c0Sjeremylt   bool useCollograd = true;
806*18d499f1SYohann   data->maxP1d = 0;
8071da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
8081da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
8091da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
8106c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
8110f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
8120f54b25eSjeremylt       CeedChk(ierr);
8131da99368SJeremy L Thompson 
8141da99368SJeremy L Thompson       // Check for collocated gradient
815*18d499f1SYohann       useCollograd = useCollograd && basis_data->d_collograd1d;
8161da99368SJeremy L Thompson 
8171da99368SJeremy L Thompson       // Collect dim and Q1d
8181da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
8191da99368SJeremy L Thompson       bool isTensor;
820fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8211da99368SJeremy L Thompson       if (isTensor) {
8221da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
823*18d499f1SYohann         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
824*18d499f1SYohann         if (P1d>data->maxP1d) data->maxP1d = P1d;
8251da99368SJeremy L Thompson       } else {
826e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
8271da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
828e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8291da99368SJeremy L Thompson         }
8301da99368SJeremy L Thompson     }
8311da99368SJeremy L Thompson   }
8321da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8331da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8341da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
8351da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
8360f54b25eSjeremylt 
8371da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
8386c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
8390f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
8400f54b25eSjeremylt       CeedChk(ierr);
8410f54b25eSjeremylt 
8421da99368SJeremy L Thompson       // Collect dim and Q1d
8431da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
8441da99368SJeremy L Thompson       bool isTensor;
845fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8461da99368SJeremy L Thompson       if (isTensor) {
8471da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
8481da99368SJeremy L Thompson       } else {
849e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
8501da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
851e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8521da99368SJeremy L Thompson         }
8530f54b25eSjeremylt 
8540f54b25eSjeremylt       // Check for collocated gradient
85564d3f0c0Sjeremylt       useCollograd = useCollograd && basis_data->d_collograd1d;
8561da99368SJeremy L Thompson     }
8571da99368SJeremy L Thompson   }
8581da99368SJeremy L Thompson   data->dim = dim;
8591da99368SJeremy L Thompson   data->Q1d = Q1d;
8601da99368SJeremy L Thompson 
8611da99368SJeremy L Thompson   // Define CEED_Q_VLA
86264d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
8631da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8641da99368SJeremy L Thompson   } else {
8651da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8661da99368SJeremy L Thompson   }
8671da99368SJeremy L Thompson 
868241a4b83SYohann   code << qFunction;
869241a4b83SYohann 
870241a4b83SYohann   // Setup
871818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
872c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
873241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
874241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
875241a4b83SYohann     CeedChk(ierr);
8761da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
877241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
878241a4b83SYohann     }
879241a4b83SYohann   }
880241a4b83SYohann 
881241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
882241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
883241a4b83SYohann   }
8841da99368SJeremy L Thompson 
885241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
886241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8871da99368SJeremy L Thompson 
888241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
889241a4b83SYohann   code << "  BackendData data;\n";
890241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
891241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
892241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
893241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
894*18d499f1SYohann   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
895920dcdc4Sjeremylt 
896818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
897ac421f39SYohann   //Initialize constants, and matrices B and G
898241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
899818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
900241a4b83SYohann     // Get elemsize, emode, ncomp
901241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
902241a4b83SYohann     CeedChk(ierr);
903241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
904241a4b83SYohann     CeedChk(ierr);
905241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
906241a4b83SYohann     CeedChk(ierr);
9074d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
908241a4b83SYohann     CeedChk(ierr);
909920dcdc4Sjeremylt 
910920dcdc4Sjeremylt     // Set field constants
911920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
912920dcdc4Sjeremylt       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
913920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
914920dcdc4Sjeremylt         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
915920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
916920dcdc4Sjeremylt       } else {
917920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
918920dcdc4Sjeremylt       }
919920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
920920dcdc4Sjeremylt     }
921920dcdc4Sjeremylt 
922920dcdc4Sjeremylt     // Load basis data
923920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
924241a4b83SYohann     switch (emode) {
925241a4b83SYohann     case CEED_EVAL_NONE:
926241a4b83SYohann       break;
927241a4b83SYohann     case CEED_EVAL_INTERP:
9286c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
929241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
930241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932241a4b83SYohann       break;
933241a4b83SYohann     case CEED_EVAL_GRAD:
9346c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
935241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
936241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
937241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
93864d3f0c0Sjeremylt       if (useCollograd) {
939ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
940ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
941ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
942ac421f39SYohann       } else {
943ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
944ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
945241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
946ac421f39SYohann       }
947241a4b83SYohann       break;
948241a4b83SYohann     case CEED_EVAL_WEIGHT:
949241a4b83SYohann       break; // No action
950241a4b83SYohann     case CEED_EVAL_DIV:
951241a4b83SYohann       break; // TODO: Not implemented
952241a4b83SYohann     case CEED_EVAL_CURL:
953241a4b83SYohann       break; // TODO: Not implemented
954241a4b83SYohann     }
955241a4b83SYohann   }
956920dcdc4Sjeremylt 
957818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
958241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
959818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
960241a4b83SYohann     // Get elemsize, emode, ncomp
961241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
962241a4b83SYohann     CeedChk(ierr);
963241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
964241a4b83SYohann     CeedChk(ierr);
965241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
966241a4b83SYohann     CeedChk(ierr);
9674d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
968241a4b83SYohann     CeedChk(ierr);
969920dcdc4Sjeremylt 
970920dcdc4Sjeremylt     // Set field constants
971241a4b83SYohann     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
972920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
973241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
974241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
975920dcdc4Sjeremylt     } else {
976920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
977920dcdc4Sjeremylt     }
978920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
979920dcdc4Sjeremylt 
980920dcdc4Sjeremylt     // Load basis data
981920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
982920dcdc4Sjeremylt     switch (emode) {
983920dcdc4Sjeremylt     case CEED_EVAL_NONE:
984920dcdc4Sjeremylt       break; // No action
985920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
9866c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
987241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
988241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990241a4b83SYohann       break;
991241a4b83SYohann     case CEED_EVAL_GRAD:
9926c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
993241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
994241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
995241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
99664d3f0c0Sjeremylt       if (useCollograd) {
997ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
998ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
999ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1000ac421f39SYohann       } else {
1001ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
1002ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1003241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1004ac421f39SYohann       }
1005241a4b83SYohann       break;
1006e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1007241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1008241a4b83SYohann       Ceed ceed;
1009241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1010241a4b83SYohann       return CeedError(ceed, 1,
1011241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1012241a4b83SYohann       break; // Should not occur
1013241a4b83SYohann     }
1014241a4b83SYohann     case CEED_EVAL_DIV:
1015241a4b83SYohann       break; // TODO: Not implemented
1016241a4b83SYohann     case CEED_EVAL_CURL:
1017241a4b83SYohann       break; // TODO: Not implemented
1018e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1019241a4b83SYohann     }
1020241a4b83SYohann   }
1021818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
1022ac421f39SYohann   code << "  __syncthreads();\n";
1023241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1024241a4b83SYohann   // Input basis apply if needed
1025ac421f39SYohann   // Generate the correct eval mode code for each input
1026818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
1027241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1028818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1029241a4b83SYohann     // Get elemsize, emode, ncomp
1030241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1031241a4b83SYohann     CeedChk(ierr);
1032241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1033241a4b83SYohann     CeedChk(ierr);
1034241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1035241a4b83SYohann     CeedChk(ierr);
10364d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1037241a4b83SYohann     CeedChk(ierr);
1038920dcdc4Sjeremylt 
1039920dcdc4Sjeremylt     // Restriction
1040920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
104164d3f0c0Sjeremylt         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1042241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10439a2291e3SJeremy L Thompson 
10449a2291e3SJeremy L Thompson       bool isStrided;
1045fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
10469a2291e3SJeremy L Thompson       if (!isStrided) {
10475c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
10485c7b696cSjeremylt         CeedChk(ierr);
10495c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10505c7b696cSjeremylt         CeedInt compstride;
10515c7b696cSjeremylt         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
10525c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
10536c845298Sjeremylt         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
10549a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10555c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1056ccedf6b0Sjeremylt       } else {
1057920dcdc4Sjeremylt         bool backendstrides;
1058fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1059920dcdc4Sjeremylt         CeedChk(ierr);
106013585805SJeremy L Thompson         CeedInt nelem;
106113585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
106213585805SJeremy L Thompson         CeedChk(ierr);
106313585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1064920dcdc4Sjeremylt         if (!backendstrides) {
1065920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1066920dcdc4Sjeremylt           CeedChk(ierr);
1067ccedf6b0Sjeremylt         }
1068920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1069d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1070920dcdc4Sjeremylt       }
1071920dcdc4Sjeremylt     }
1072920dcdc4Sjeremylt 
1073920dcdc4Sjeremylt     // Basis action
1074920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1075920dcdc4Sjeremylt     switch (emode) {
1076920dcdc4Sjeremylt     case CEED_EVAL_NONE:
107764d3f0c0Sjeremylt       if (!useCollograd) {
1078920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1079920dcdc4Sjeremylt       }
1080920dcdc4Sjeremylt       break;
1081920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1082241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1083241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1084241a4b83SYohann       break;
1085241a4b83SYohann     case CEED_EVAL_GRAD:
108664d3f0c0Sjeremylt       if (useCollograd) {
1087ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1088ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1089ac421f39SYohann       } else {
1090241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1091241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1092ac421f39SYohann       }
1093241a4b83SYohann       break;
1094241a4b83SYohann     case CEED_EVAL_WEIGHT:
1095241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1096241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
10976c845298Sjeremylt       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1098241a4b83SYohann       data->W = basis_data->d_qweight1d;
1099241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1100241a4b83SYohann       break; // No action
1101241a4b83SYohann     case CEED_EVAL_DIV:
1102241a4b83SYohann       break; // TODO: Not implemented
1103241a4b83SYohann     case CEED_EVAL_CURL:
1104241a4b83SYohann       break; // TODO: Not implemented
1105241a4b83SYohann     }
1106241a4b83SYohann   }
1107ac421f39SYohann 
1108241a4b83SYohann   // Q function
1109818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1110241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1111818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1112241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1113241a4b83SYohann     CeedChk(ierr);
1114241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1115241a4b83SYohann     {
111664d3f0c0Sjeremylt       if (useCollograd) {
1117ac421f39SYohann         //Accumulator for gradient slices
1118ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1119ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1120ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1121ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1122ac421f39SYohann         code << "      }\n";
1123ac421f39SYohann         code << "    }\n";
1124ac421f39SYohann       } else {
1125241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1126241a4b83SYohann       }
1127ac421f39SYohann     }
1128241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1129241a4b83SYohann     {
1130241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131241a4b83SYohann     }
1132241a4b83SYohann   }
1133ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
113464d3f0c0Sjeremylt   if (useCollograd) {
1135920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1136ac421f39SYohann     code << "#pragma unroll\n";
1137ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1138818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1139ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1140818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1141ac421f39SYohann       // Get elemsize, emode, ncomp
1142ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1143ac421f39SYohann       CeedChk(ierr);
1144ac421f39SYohann       // Basis action
1145920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1146ac421f39SYohann       switch (emode) {
1147ac421f39SYohann       case CEED_EVAL_NONE:
1148ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11499a2291e3SJeremy L Thompson 
11509a2291e3SJeremy L Thompson         bool isStrided;
1151792ff326SYohann Dudouit         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
115213585805SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1153fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
11549a2291e3SJeremy L Thompson         if (!isStrided) {
11555c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
11565c7b696cSjeremylt           CeedChk(ierr);
11575c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11585c7b696cSjeremylt           CeedInt compstride;
11595c7b696cSjeremylt           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
11605c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
11616c845298Sjeremylt           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
11629a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11635c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1164920dcdc4Sjeremylt         } else {
1165920dcdc4Sjeremylt           bool backendstrides;
1166fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1167920dcdc4Sjeremylt           CeedChk(ierr);
116813585805SJeremy L Thompson           CeedInt nelem;
116913585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
117013585805SJeremy L Thompson           CeedChk(ierr);
117113585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1172920dcdc4Sjeremylt           if (!backendstrides) {
1173920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1174920dcdc4Sjeremylt             CeedChk(ierr);
1175920dcdc4Sjeremylt           }
1176920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1177d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1178920dcdc4Sjeremylt         }
1179ac421f39SYohann         break;
1180ac421f39SYohann       case CEED_EVAL_INTERP:
1181ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1182ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1183ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1184ac421f39SYohann         code << "      }\n";
1185ac421f39SYohann         break;
1186ac421f39SYohann       case CEED_EVAL_GRAD:
1187ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1188ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1189ac421f39SYohann         break;
1190ac421f39SYohann       case CEED_EVAL_WEIGHT:
1191ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1192ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1193ac421f39SYohann         break; // No action
1194ac421f39SYohann       case CEED_EVAL_DIV:
1195ac421f39SYohann         break; // TODO: Not implemented
1196ac421f39SYohann       case CEED_EVAL_CURL:
1197ac421f39SYohann         break; // TODO: Not implemented
1198ac421f39SYohann       }
1199ac421f39SYohann     }
1200818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1201ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1202818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1203ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1204ac421f39SYohann       CeedChk(ierr);
1205ac421f39SYohann       // Basis action
1206ac421f39SYohann       switch (emode) {
1207ac421f39SYohann       case CEED_EVAL_NONE:
1208ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1209ac421f39SYohann         break; // No action
1210ac421f39SYohann       case CEED_EVAL_INTERP:
1211ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1212ac421f39SYohann         break;
1213ac421f39SYohann       case CEED_EVAL_GRAD:
1214ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1215ac421f39SYohann         break;
1216ac421f39SYohann       case CEED_EVAL_WEIGHT:
1217ac421f39SYohann         break; // Should not occur
1218ac421f39SYohann       case CEED_EVAL_DIV:
1219ac421f39SYohann         break; // TODO: Not implemented
1220ac421f39SYohann       case CEED_EVAL_CURL:
1221ac421f39SYohann         break; // TODO: Not implemented
1222ac421f39SYohann       }
1223ac421f39SYohann     }
1224ac421f39SYohann   } else {
1225920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1226818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1227ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1228818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1229ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1230ac421f39SYohann     }
1231818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1232ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1233818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1234ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1235ac421f39SYohann     }
1236ac421f39SYohann   }
1237818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12384d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12394d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1240818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1241ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12424d537eeaSYohann   }
12434d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12444d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1245818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1246ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12474d537eeaSYohann   }
1248818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1249ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
125064d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
1251ac421f39SYohann     code << "1";
1252ac421f39SYohann   } else {
1253ac421f39SYohann     code << "Q1d";
1254ac421f39SYohann   }
1255ac421f39SYohann   code << ", in, out);\n";
125664d3f0c0Sjeremylt   if (useCollograd) {
1257920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1258818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1259ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1260818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1261ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1262ac421f39SYohann       CeedChk(ierr);
1263ac421f39SYohann       // Basis action
1264920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1265ac421f39SYohann       switch (emode) {
1266ac421f39SYohann       case CEED_EVAL_NONE:
1267ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1268ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1269ac421f39SYohann         code << "      }\n";
1270ac421f39SYohann         break; // No action
1271ac421f39SYohann       case CEED_EVAL_INTERP:
1272ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1273ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1274ac421f39SYohann         code << "      }\n";
1275ac421f39SYohann         break;
1276ac421f39SYohann       case CEED_EVAL_GRAD:
1277ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1278ac421f39SYohann         break;
1279ac421f39SYohann       case CEED_EVAL_WEIGHT:
1280ac421f39SYohann         break; // Should not occur
1281ac421f39SYohann       case CEED_EVAL_DIV:
1282ac421f39SYohann         break; // TODO: Not implemented
1283ac421f39SYohann       case CEED_EVAL_CURL:
1284ac421f39SYohann         break; // TODO: Not implemented
1285ac421f39SYohann       }
1286ac421f39SYohann     }
1287ac421f39SYohann     code << "    }\n";
1288ac421f39SYohann   }
1289241a4b83SYohann 
1290241a4b83SYohann   // Output basis apply if needed
1291ac421f39SYohann   // Generate the correct eval mode code for each output
1292818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1293241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1294818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1295241a4b83SYohann     // Get elemsize, emode, ncomp
1296241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1297241a4b83SYohann     CeedChk(ierr);
1298241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1299241a4b83SYohann     CeedChk(ierr);
1300241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1301241a4b83SYohann     CeedChk(ierr);
13024d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1303241a4b83SYohann     CeedChk(ierr);
1304241a4b83SYohann     // Basis action
1305920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1306241a4b83SYohann     switch (emode) {
1307241a4b83SYohann     case CEED_EVAL_NONE:
1308920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1309241a4b83SYohann       break; // No action
1310241a4b83SYohann     case CEED_EVAL_INTERP:
1311241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1312241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1313241a4b83SYohann       break;
1314241a4b83SYohann     case CEED_EVAL_GRAD:
1315241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
131664d3f0c0Sjeremylt       if (useCollograd) {
1317ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1318ac421f39SYohann       } else {
1319241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1320ac421f39SYohann       }
1321241a4b83SYohann       break;
1322e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1323241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1324241a4b83SYohann       Ceed ceed;
1325241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1326241a4b83SYohann       return CeedError(ceed, 1,
1327241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1328241a4b83SYohann       break; // Should not occur
1329241a4b83SYohann     }
1330241a4b83SYohann     case CEED_EVAL_DIV:
1331241a4b83SYohann       break; // TODO: Not implemented
1332241a4b83SYohann     case CEED_EVAL_CURL:
1333241a4b83SYohann       break; // TODO: Not implemented
1334e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1335241a4b83SYohann     }
1336920dcdc4Sjeremylt     // Restriction
13379a2291e3SJeremy L Thompson       bool isStrided;
1338fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
13399a2291e3SJeremy L Thompson     if (!isStrided) {
13405c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
13415c7b696cSjeremylt       CeedChk(ierr);
13425c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13435c7b696cSjeremylt       CeedInt compstride;
13445c7b696cSjeremylt       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
13455c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
13466c845298Sjeremylt       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
13479a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13485c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1349920dcdc4Sjeremylt     } else {
1350920dcdc4Sjeremylt       bool backendstrides;
1351fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1352920dcdc4Sjeremylt       CeedChk(ierr);
135313585805SJeremy L Thompson       CeedInt nelem;
135413585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
135513585805SJeremy L Thompson       CeedChk(ierr);
135613585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1357920dcdc4Sjeremylt       if (!backendstrides) {
1358920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1359920dcdc4Sjeremylt         CeedChk(ierr);
1360920dcdc4Sjeremylt       }
1361920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1362d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1363920dcdc4Sjeremylt     }
1364241a4b83SYohann   }
1365241a4b83SYohann 
1366241a4b83SYohann   code << "  }\n";
1367818e0025SJeremy L Thompson   code << "}\n";
1368818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1369241a4b83SYohann 
1370ab213215SJeremy L Thompson   // View kernel for debugging
1371b8e71988SJeremy L Thompson   CeedDebug(code.str().c_str());
1372241a4b83SYohann 
1373*18d499f1SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1374*18d499f1SYohann                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1375920dcdc4Sjeremylt   CeedChk(ierr);
1376c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1377241a4b83SYohann   CeedChk(ierr);
1378241a4b83SYohann 
1379241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1380241a4b83SYohann   return 0;
1381241a4b83SYohann }
1382ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1383