xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c3c443fa4dd888aca94af004b304a7a213962033)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
167df94212SJeremy L Thompson 
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
21241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
22241a4b83SYohann 
23f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
24ab213215SJeremy L Thompson //------------------------------------------------------------------------------
25ab213215SJeremy L Thompson // Atomic add, for older CUDA
26ab213215SJeremy L Thompson //------------------------------------------------------------------------------
27241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
28241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
29241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
30241a4b83SYohann   do {
31241a4b83SYohann     assumed = old;
32241a4b83SYohann     old =
33241a4b83SYohann       atomicCAS(address_as_ull, assumed,
34241a4b83SYohann                 __double_as_longlong(val +
35241a4b83SYohann                                      __longlong_as_double(assumed)));
36241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
37241a4b83SYohann     // (since NaN != NaN)
38241a4b83SYohann   } while (assumed != old);
39241a4b83SYohann   return __longlong_as_double(old);
40241a4b83SYohann }
41f1a13f77SYohann Dudouit );
42f1a13f77SYohann Dudouit 
43f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
44f1a13f77SYohann Dudouit 
45ab213215SJeremy L Thompson //------------------------------------------------------------------------------
46ab213215SJeremy L Thompson // Typedefs
47ab213215SJeremy L Thompson //------------------------------------------------------------------------------
48f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
49f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
50f1a13f77SYohann Dudouit 
51f1a13f77SYohann Dudouit typedef struct {
52f1a13f77SYohann Dudouit   CeedInt tidx;
53f1a13f77SYohann Dudouit   CeedInt tidy;
54f1a13f77SYohann Dudouit   CeedInt tidz;
55f1a13f77SYohann Dudouit   CeedInt tid;
56f1a13f77SYohann Dudouit   CeedScalar* slice;
57f1a13f77SYohann Dudouit } BackendData;
58241a4b83SYohann 
59ab213215SJeremy L Thompson //------------------------------------------------------------------------------
60ab213215SJeremy L Thompson // Load matrices for basis actions
61ab213215SJeremy L Thompson //------------------------------------------------------------------------------
62241a4b83SYohann template <int P, int Q>
63241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
64ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
65241a4b83SYohann     B[i] = d_B[i];
66241a4b83SYohann }
67241a4b83SYohann 
68ab213215SJeremy L Thompson //------------------------------------------------------------------------------
69241a4b83SYohann // 1D
70ab213215SJeremy L Thompson //------------------------------------------------------------------------------
71ab213215SJeremy L Thompson 
72ab213215SJeremy L Thompson //------------------------------------------------------------------------------
73ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
74ab213215SJeremy L Thompson //------------------------------------------------------------------------------
755c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
765c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
77ab213215SJeremy L Thompson   if (data.tidx < P1d) {
784d537eeaSYohann     const CeedInt node = data.tidx;
79ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
80ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
815c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
82241a4b83SYohann   }
83241a4b83SYohann }
84920dcdc4Sjeremylt 
85ab213215SJeremy L Thompson //------------------------------------------------------------------------------
86ab213215SJeremy L Thompson // L-vector -> E-vector, strided
87ab213215SJeremy L Thompson //------------------------------------------------------------------------------
88d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
89d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
90ab213215SJeremy L Thompson   if (data.tidx < P1d) {
91ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
92d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
93ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
94d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
95ccedf6b0Sjeremylt   }
96ccedf6b0Sjeremylt }
97241a4b83SYohann 
98ab213215SJeremy L Thompson //------------------------------------------------------------------------------
99ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
100ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1025c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
103ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1044d537eeaSYohann     const CeedInt node = data.tidx;
105ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
106ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1075c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
108241a4b83SYohann   }
109241a4b83SYohann }
110241a4b83SYohann 
111ab213215SJeremy L Thompson //------------------------------------------------------------------------------
112ab213215SJeremy L Thompson // E-vector -> L-vector, strided
113ab213215SJeremy L Thompson //------------------------------------------------------------------------------
114d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
115d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
116ab213215SJeremy L Thompson   if (data.tidx < P1d) {
117ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
118d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
119ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
120d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
121ccedf6b0Sjeremylt   }
122ccedf6b0Sjeremylt }
123ccedf6b0Sjeremylt 
124ab213215SJeremy L Thompson //------------------------------------------------------------------------------
125ab213215SJeremy L Thompson // 1D tensor contraction x
126ab213215SJeremy L Thompson //------------------------------------------------------------------------------
127241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
128ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
129241a4b83SYohann   data.slice[data.tidx] = *U;
130241a4b83SYohann   __syncthreads();
131241a4b83SYohann   *V = 0.0;
132ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
133ab213215SJeremy L Thompson     *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
134241a4b83SYohann   __syncthreads();
135241a4b83SYohann }
136241a4b83SYohann 
137ab213215SJeremy L Thompson //------------------------------------------------------------------------------
138ab213215SJeremy L Thompson // 1D transpose tensor contraction x
139ab213215SJeremy L Thompson //------------------------------------------------------------------------------
140241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
141ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
142241a4b83SYohann   data.slice[data.tidx] = *U;
143241a4b83SYohann   __syncthreads();
144241a4b83SYohann   *V = 0.0;
145ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
146ab213215SJeremy L Thompson     *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
147241a4b83SYohann   __syncthreads();
148241a4b83SYohann }
149241a4b83SYohann 
150ab213215SJeremy L Thompson //------------------------------------------------------------------------------
151ab213215SJeremy L Thompson // 1D interpolate to quadrature points
152ab213215SJeremy L Thompson //------------------------------------------------------------------------------
153241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
154ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
155ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
156241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
157241a4b83SYohann }
158241a4b83SYohann 
159ab213215SJeremy L Thompson //------------------------------------------------------------------------------
160ab213215SJeremy L Thompson // 1D interpolate transpose
161ab213215SJeremy L Thompson //------------------------------------------------------------------------------
162241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
163ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
164ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
165241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
166241a4b83SYohann }
167241a4b83SYohann 
168ab213215SJeremy L Thompson //------------------------------------------------------------------------------
169ab213215SJeremy L Thompson // 1D derivatives at quadrature points
170ab213215SJeremy L Thompson //------------------------------------------------------------------------------
171241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
172ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
173ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
174241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
175241a4b83SYohann }
176241a4b83SYohann 
177ab213215SJeremy L Thompson //------------------------------------------------------------------------------
178ab213215SJeremy L Thompson // 1D derivatives transpose
179ab213215SJeremy L Thompson //------------------------------------------------------------------------------
180241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
181ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
182ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
183241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
184241a4b83SYohann }
185241a4b83SYohann 
186ab213215SJeremy L Thompson //------------------------------------------------------------------------------
187241a4b83SYohann // 2D
188ab213215SJeremy L Thompson //------------------------------------------------------------------------------
189ab213215SJeremy L Thompson 
190ab213215SJeremy L Thompson //------------------------------------------------------------------------------
191ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
192ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1935c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1945c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
195ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
1964d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
197ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
198ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1995c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
200241a4b83SYohann   }
201241a4b83SYohann }
202920dcdc4Sjeremylt 
203ab213215SJeremy L Thompson //------------------------------------------------------------------------------
204ab213215SJeremy L Thompson // L-vector -> E-vector, strided
205ab213215SJeremy L Thompson //------------------------------------------------------------------------------
206d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
207d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
208ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
209ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
210d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
211ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
212d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
213ccedf6b0Sjeremylt   }
214ccedf6b0Sjeremylt }
215241a4b83SYohann 
216ab213215SJeremy L Thompson //------------------------------------------------------------------------------
217ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
218ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2195c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
2205c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
221ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2224d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
223ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
224ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2255c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
226241a4b83SYohann   }
227241a4b83SYohann }
228241a4b83SYohann 
229ab213215SJeremy L Thompson //------------------------------------------------------------------------------
230ab213215SJeremy L Thompson // E-vector -> L-vector, strided
231ab213215SJeremy L Thompson //------------------------------------------------------------------------------
232d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
233d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
234ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
235ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
236d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
237ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
238d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
239ccedf6b0Sjeremylt   }
240ccedf6b0Sjeremylt }
241ccedf6b0Sjeremylt 
242ab213215SJeremy L Thompson //------------------------------------------------------------------------------
243ab213215SJeremy L Thompson // 2D tensor contraction x
244ab213215SJeremy L Thompson //------------------------------------------------------------------------------
245241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
246ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
247241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
248241a4b83SYohann   __syncthreads();
249241a4b83SYohann   *V = 0.0;
250ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
251ab213215SJeremy L Thompson     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
252241a4b83SYohann   __syncthreads();
253241a4b83SYohann }
254241a4b83SYohann 
255ab213215SJeremy L Thompson //------------------------------------------------------------------------------
256ab213215SJeremy L Thompson // 2D tensor contract y
257ab213215SJeremy L Thompson //------------------------------------------------------------------------------
258241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
259ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
260241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
261241a4b83SYohann   __syncthreads();
262241a4b83SYohann   *V = 0.0;
263ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
264ab213215SJeremy L Thompson     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
265241a4b83SYohann   __syncthreads();
266241a4b83SYohann }
267241a4b83SYohann 
268ab213215SJeremy L Thompson //------------------------------------------------------------------------------
269ab213215SJeremy L Thompson // 2D transpose tensor contract y
270ab213215SJeremy L Thompson //------------------------------------------------------------------------------
271241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
272ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
273241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
274241a4b83SYohann   __syncthreads();
275241a4b83SYohann   *V = 0.0;
276ab213215SJeremy L Thompson   if (data.tidy < P1d)
277ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
278ab213215SJeremy L Thompson       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
279241a4b83SYohann   __syncthreads();
280241a4b83SYohann }
281241a4b83SYohann 
282ab213215SJeremy L Thompson //------------------------------------------------------------------------------
283ab213215SJeremy L Thompson // 2D transpose tensor contract x
284ab213215SJeremy L Thompson //------------------------------------------------------------------------------
285241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
286ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
287241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
288241a4b83SYohann   __syncthreads();
289241a4b83SYohann   *V = 0.0;
290ab213215SJeremy L Thompson   if (data.tidx < P1d)
291ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
292ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
293241a4b83SYohann   __syncthreads();
294241a4b83SYohann }
295241a4b83SYohann 
296ab213215SJeremy L Thompson //------------------------------------------------------------------------------
297ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
298ab213215SJeremy L Thompson //------------------------------------------------------------------------------
299241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
300ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
301241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
302241a4b83SYohann   __syncthreads();
303ab213215SJeremy L Thompson   if (data.tidx < P1d)
304ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
305ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
306241a4b83SYohann   __syncthreads();
307241a4b83SYohann }
308241a4b83SYohann 
309ab213215SJeremy L Thompson //------------------------------------------------------------------------------
310ab213215SJeremy L Thompson // 2D interpolate to quadrature points
311ab213215SJeremy L Thompson //------------------------------------------------------------------------------
312241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
313ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
314241a4b83SYohann   CeedScalar r_t[1];
315ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
316241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
317241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
318241a4b83SYohann   }
319241a4b83SYohann }
320241a4b83SYohann 
321ab213215SJeremy L Thompson //------------------------------------------------------------------------------
322ab213215SJeremy L Thompson // 2D interpolate transpose
323ab213215SJeremy L Thompson //------------------------------------------------------------------------------
324241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
325ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
326241a4b83SYohann   CeedScalar r_t[1];
327ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
328241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
329241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
330241a4b83SYohann   }
331241a4b83SYohann }
332241a4b83SYohann 
333ab213215SJeremy L Thompson //------------------------------------------------------------------------------
334ab213215SJeremy L Thompson // 2D derivatives at quadrature points
335ab213215SJeremy L Thompson //------------------------------------------------------------------------------
336241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
337ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
338241a4b83SYohann   CeedScalar r_t[1];
339ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
340241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
341241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
342241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
343241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
344241a4b83SYohann   }
345241a4b83SYohann }
346241a4b83SYohann 
347ab213215SJeremy L Thompson //------------------------------------------------------------------------------
348ab213215SJeremy L Thompson // 2D derivatives transpose
349ab213215SJeremy L Thompson //------------------------------------------------------------------------------
350241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
351ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
352241a4b83SYohann   CeedScalar r_t[1];
353ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
354241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
355241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
356241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
357241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
358241a4b83SYohann   }
359241a4b83SYohann }
360241a4b83SYohann 
361ab213215SJeremy L Thompson //------------------------------------------------------------------------------
362241a4b83SYohann // 3D
363ab213215SJeremy L Thompson //------------------------------------------------------------------------------
364ab213215SJeremy L Thompson 
365ab213215SJeremy L Thompson //------------------------------------------------------------------------------
366ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
367ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3685c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
3695c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
370ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
371241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3724d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
373ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
374ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3755c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
376241a4b83SYohann     }
377241a4b83SYohann }
378920dcdc4Sjeremylt 
379ab213215SJeremy L Thompson //------------------------------------------------------------------------------
380ab213215SJeremy L Thompson // L-vector -> E-vector, strided
381ab213215SJeremy L Thompson //------------------------------------------------------------------------------
382d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
383d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
384ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
385ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
386ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
387d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
388ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
389d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
390ccedf6b0Sjeremylt     }
391ccedf6b0Sjeremylt }
392241a4b83SYohann 
393ab213215SJeremy L Thompson //------------------------------------------------------------------------------
394ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
395ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3965c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
3975c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
398ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
399920dcdc4Sjeremylt   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
400ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; ++comp)
4015c7b696cSjeremylt     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
402ac421f39SYohann }
403ac421f39SYohann 
404ab213215SJeremy L Thompson //------------------------------------------------------------------------------
405ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
406ab213215SJeremy L Thompson //------------------------------------------------------------------------------
407d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
40825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
409920dcdc4Sjeremylt   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410d80fc06aSjeremylt   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
411ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; ++comp)
412d80fc06aSjeremylt     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
413920dcdc4Sjeremylt }
414920dcdc4Sjeremylt 
415ab213215SJeremy L Thompson //------------------------------------------------------------------------------
416ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
417ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4185c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
4195c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
420ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
421241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4224d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
423ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
424ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4255c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
426241a4b83SYohann     }
427241a4b83SYohann }
428241a4b83SYohann 
429ab213215SJeremy L Thompson //------------------------------------------------------------------------------
430ab213215SJeremy L Thompson // E-vector -> L-vector, strided
431ab213215SJeremy L Thompson //------------------------------------------------------------------------------
432d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4332f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
434ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
435ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
436ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
437d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
438ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
439d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
440ccedf6b0Sjeremylt     }
441ccedf6b0Sjeremylt }
442ccedf6b0Sjeremylt 
443ab213215SJeremy L Thompson //------------------------------------------------------------------------------
444ab213215SJeremy L Thompson // 3D tensor contract x
445ab213215SJeremy L Thompson //------------------------------------------------------------------------------
446241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
447ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
448ac421f39SYohann   CeedScalar r_B[P1d];
449ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
450ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
451ab213215SJeremy L Thompson 
452ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
453241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
454241a4b83SYohann     __syncthreads();
455241a4b83SYohann     V[k] = 0.0;
456ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
457ab213215SJeremy L Thompson       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
458241a4b83SYohann     __syncthreads();
459241a4b83SYohann   }
460241a4b83SYohann }
461241a4b83SYohann 
462ab213215SJeremy L Thompson //------------------------------------------------------------------------------
463ab213215SJeremy L Thompson // 3D tensor contract y
464ab213215SJeremy L Thompson //------------------------------------------------------------------------------
465241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
466ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
467ac421f39SYohann   CeedScalar r_B[P1d];
468ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
469ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
470ab213215SJeremy L Thompson 
471ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
472241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
473241a4b83SYohann     __syncthreads();
474241a4b83SYohann     V[k] = 0.0;
475ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
476ab213215SJeremy L Thompson       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
477241a4b83SYohann     __syncthreads();
478241a4b83SYohann   }
479241a4b83SYohann }
480241a4b83SYohann 
481ab213215SJeremy L Thompson //------------------------------------------------------------------------------
482ab213215SJeremy L Thompson // 3D tensor contract z
483ab213215SJeremy L Thompson //------------------------------------------------------------------------------
484241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
485ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
486ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
487241a4b83SYohann     V[k] = 0.0;
488ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
489ab213215SJeremy L Thompson       V[k] += B[i + k*P1d] * U[i]; // Contract z direction
490241a4b83SYohann   }
491241a4b83SYohann }
492241a4b83SYohann 
493ab213215SJeremy L Thompson //------------------------------------------------------------------------------
494ab213215SJeremy L Thompson // 3D transpose tensor contract z
495ab213215SJeremy L Thompson //------------------------------------------------------------------------------
496241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
497ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
498ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
499241a4b83SYohann     V[k] = 0.0;
500ab213215SJeremy L Thompson     if (k < P1d)
501ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
502ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
503241a4b83SYohann   }
504241a4b83SYohann }
505241a4b83SYohann 
506ab213215SJeremy L Thompson //------------------------------------------------------------------------------
507ab213215SJeremy L Thompson // 3D transpose tensor contract y
508ab213215SJeremy L Thompson //------------------------------------------------------------------------------
509241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
510ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
511ac421f39SYohann   CeedScalar r_B[Q1d];
512ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
513ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
514ab213215SJeremy L Thompson 
515ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
516241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
517241a4b83SYohann     __syncthreads();
518241a4b83SYohann     V[k] = 0.0;
519ab213215SJeremy L Thompson     if (data.tidy < P1d)
520ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
521ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
522ac421f39SYohann     __syncthreads();
523ac421f39SYohann   }
524ac421f39SYohann }
525ac421f39SYohann 
526ab213215SJeremy L Thompson //------------------------------------------------------------------------------
527ab213215SJeremy L Thompson // 3D transpose tensor contract add y
528ab213215SJeremy L Thompson //------------------------------------------------------------------------------
529ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
530ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
531ac421f39SYohann   CeedScalar r_B[Q1d];
532ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
533ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
534ab213215SJeremy L Thompson 
535ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
536ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
537ac421f39SYohann     __syncthreads();
538ab213215SJeremy L Thompson     if (data.tidy < P1d)
539ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
540ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
541241a4b83SYohann     __syncthreads();
542241a4b83SYohann   }
543241a4b83SYohann }
544241a4b83SYohann 
545ab213215SJeremy L Thompson //------------------------------------------------------------------------------
546ab213215SJeremy L Thompson // 3D transpose tensor contract x
547ab213215SJeremy L Thompson //------------------------------------------------------------------------------
548241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
549ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
550ac421f39SYohann   CeedScalar r_B[Q1d];
551ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
552ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
553ab213215SJeremy L Thompson 
554ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
555241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
556241a4b83SYohann     __syncthreads();
557241a4b83SYohann     V[k] = 0.0;
558ab213215SJeremy L Thompson     if (data.tidx < P1d)
559ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
560ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
561241a4b83SYohann     __syncthreads();
562241a4b83SYohann   }
563241a4b83SYohann }
564241a4b83SYohann 
565ab213215SJeremy L Thompson //------------------------------------------------------------------------------
566ab213215SJeremy L Thompson // 3D transpose tensor contract add x
567ab213215SJeremy L Thompson //------------------------------------------------------------------------------
568241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
569ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
570ac421f39SYohann   CeedScalar r_B[Q1d];
571ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
572ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
573ab213215SJeremy L Thompson 
574ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
575241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
576241a4b83SYohann     __syncthreads();
577ab213215SJeremy L Thompson     if (data.tidx < P1d)
578ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
579ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
580241a4b83SYohann     __syncthreads();
581241a4b83SYohann   }
582241a4b83SYohann }
583241a4b83SYohann 
584ab213215SJeremy L Thompson //------------------------------------------------------------------------------
585ab213215SJeremy L Thompson // 3D interpolate to quadrature points
586ab213215SJeremy L Thompson //------------------------------------------------------------------------------
587241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
588ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
589241a4b83SYohann   CeedScalar r_t1[Q1d];
590241a4b83SYohann   CeedScalar r_t2[Q1d];
591ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
592241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
593241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
594241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
595241a4b83SYohann   }
596241a4b83SYohann }
597241a4b83SYohann 
598ab213215SJeremy L Thompson //------------------------------------------------------------------------------
599ab213215SJeremy L Thompson // 3D interpolate transpose
600ab213215SJeremy L Thompson //------------------------------------------------------------------------------
601241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
602ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
603241a4b83SYohann   CeedScalar r_t1[Q1d];
604241a4b83SYohann   CeedScalar r_t2[Q1d];
605ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
606241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
607241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
608241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
609241a4b83SYohann   }
610241a4b83SYohann }
611241a4b83SYohann 
612ab213215SJeremy L Thompson //------------------------------------------------------------------------------
613ab213215SJeremy L Thompson // 3D derivatives at quadrature points
614ab213215SJeremy L Thompson //------------------------------------------------------------------------------
615241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
616ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
617241a4b83SYohann   CeedScalar r_t1[Q1d];
618241a4b83SYohann   CeedScalar r_t2[Q1d];
619ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
620241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
621241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
622241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
623241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
624241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
625241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
626241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
627241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
628241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
629241a4b83SYohann   }
630241a4b83SYohann }
631241a4b83SYohann 
632ab213215SJeremy L Thompson //------------------------------------------------------------------------------
633ab213215SJeremy L Thompson // 3D derivatives transpose
634ab213215SJeremy L Thompson //------------------------------------------------------------------------------
635241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
636ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
637241a4b83SYohann   CeedScalar r_t1[Q1d];
638241a4b83SYohann   CeedScalar r_t2[Q1d];
639ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
640241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
641241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
642241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
643241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
644241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
645241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
646241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
647241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
648241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
649241a4b83SYohann   }
650241a4b83SYohann }
651241a4b83SYohann 
652ab213215SJeremy L Thompson //------------------------------------------------------------------------------
653ab213215SJeremy L Thompson // 3D collocated derivatives computation
654ab213215SJeremy L Thompson //------------------------------------------------------------------------------
655ac421f39SYohann template <int NCOMP, int Q1d>
656ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
657ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
658ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
659ac421f39SYohann     __syncthreads();
660ac421f39SYohann     // X derivative
661ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
662ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
663ab213215SJeremy L Thompson       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
664ac421f39SYohann     // Y derivative
665ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
666ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
667ab213215SJeremy L Thompson       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
668ac421f39SYohann     // Z derivative
669ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
670ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
671ab213215SJeremy L Thompson       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
672ac421f39SYohann     __syncthreads();
673ac421f39SYohann   }
674ac421f39SYohann }
675ac421f39SYohann 
676ab213215SJeremy L Thompson //------------------------------------------------------------------------------
677ab213215SJeremy L Thompson // 3D collocated derivatives transpose
678ab213215SJeremy L Thompson //------------------------------------------------------------------------------
679ac421f39SYohann template <int NCOMP, int Q1d>
680ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
681ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
682ac421f39SYohann     // X derivative
683ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
684ac421f39SYohann     __syncthreads();
685ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
686ab213215SJeremy L Thompson       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
687ac421f39SYohann     __syncthreads();
688ac421f39SYohann     // Y derivative
689ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
690ac421f39SYohann     __syncthreads();
691ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
692ab213215SJeremy L Thompson       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
693ac421f39SYohann     __syncthreads();
694ac421f39SYohann     // Z derivative
695ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
696ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
697ac421f39SYohann   }
698ac421f39SYohann }
699ac421f39SYohann 
700ab213215SJeremy L Thompson //------------------------------------------------------------------------------
701ab213215SJeremy L Thompson // 1D quadrature weights
702ab213215SJeremy L Thompson //------------------------------------------------------------------------------
703241a4b83SYohann template <int Q1d>
704241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
705241a4b83SYohann   *w = qweight1d[data.tidx];
706241a4b83SYohann }
707241a4b83SYohann 
708ab213215SJeremy L Thompson //------------------------------------------------------------------------------
709ab213215SJeremy L Thompson // 2D quadrature weights
710ab213215SJeremy L Thompson //------------------------------------------------------------------------------
711241a4b83SYohann template <int Q1d>
712241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
713241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
714241a4b83SYohann }
715241a4b83SYohann 
716ab213215SJeremy L Thompson //------------------------------------------------------------------------------
717ab213215SJeremy L Thompson // 3D quadrature weights
718ab213215SJeremy L Thompson //------------------------------------------------------------------------------
719241a4b83SYohann template <int Q1d>
720241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
721241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
722ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
723241a4b83SYohann     w[z] = pw*qweight1d[z];
724241a4b83SYohann }
725241a4b83SYohann 
726241a4b83SYohann );
727ab213215SJeremy L Thompson //------------------------------------------------------------------------------
728ab213215SJeremy L Thompson // Build singe operator kernel
729ab213215SJeremy L Thompson //------------------------------------------------------------------------------
730241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
731241a4b83SYohann 
732241a4b83SYohann   using std::ostringstream;
733241a4b83SYohann   using std::string;
734241a4b83SYohann   int ierr;
735241a4b83SYohann   bool setupdone;
736fd364f38SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
737241a4b83SYohann   if (setupdone) return 0;
738241a4b83SYohann   Ceed ceed;
739241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
740241a4b83SYohann   CeedOperator_Cuda_gen *data;
741241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
742241a4b83SYohann   CeedQFunction qf;
743241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
744241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
745241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
7461da99368SJeremy L Thompson   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
7475c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
748241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
749241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
750241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
751241a4b83SYohann   CeedChk(ierr);
752241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
753241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
754241a4b83SYohann   CeedChk(ierr);
755241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
756241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
757241a4b83SYohann   CeedChk(ierr);
758241a4b83SYohann   CeedEvalMode emode;
759241a4b83SYohann   CeedBasis basis;
760241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
761241a4b83SYohann   CeedElemRestriction Erestrict;
762241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
763241a4b83SYohann 
764241a4b83SYohann   ostringstream code;
765241a4b83SYohann   string devFunctions(deviceFunctions);
766241a4b83SYohann 
767f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
768f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
769f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
770abfaacbbSSander Arens   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
771f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
772f1a13f77SYohann Dudouit   if (prop.major<6){
773f1a13f77SYohann Dudouit     code << atomicAdd;
774f1a13f77SYohann Dudouit   }
775f1a13f77SYohann Dudouit 
776241a4b83SYohann   code << devFunctions;
777241a4b83SYohann 
778241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
779*c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
780*c3c443faSYohann Dudouit   string oper;
781*c3c443faSYohann Dudouit   oper = "kernel_" + qFunctionName;
7824d537eeaSYohann 
7834d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
784ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
7851da99368SJeremy L Thompson 
7861da99368SJeremy L Thompson   // Find dim and Q1d
7871da99368SJeremy L Thompson   bool collograd = false;
7881da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
7891da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
7901da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
7911da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
7921da99368SJeremy L Thompson 
7931da99368SJeremy L Thompson       // Check for collocated gradient
7941da99368SJeremy L Thompson       if (basis_data->d_collograd1d)
7951da99368SJeremy L Thompson         collograd = true;
7961da99368SJeremy L Thompson 
7971da99368SJeremy L Thompson       // Collect dim and Q1d
7981da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
7991da99368SJeremy L Thompson       bool isTensor;
800fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8011da99368SJeremy L Thompson       if (isTensor) {
8021da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
8031da99368SJeremy L Thompson       } else {
8041da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
8051da99368SJeremy L Thompson         }
8061da99368SJeremy L Thompson     }
8071da99368SJeremy L Thompson   }
8081da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8091da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8101da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
8111da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
8121da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
8131da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
8141da99368SJeremy L Thompson       // Collect dim and Q1d
8151da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
8161da99368SJeremy L Thompson       bool isTensor;
817fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8181da99368SJeremy L Thompson       if (isTensor) {
8191da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
8201da99368SJeremy L Thompson       } else {
8211da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
8221da99368SJeremy L Thompson         }
8231da99368SJeremy L Thompson     }
8241da99368SJeremy L Thompson   }
8251da99368SJeremy L Thompson   data->dim = dim;
8261da99368SJeremy L Thompson   data->Q1d = Q1d;
8271da99368SJeremy L Thompson 
8281da99368SJeremy L Thompson   // Define CEED_Q_VLA
8291da99368SJeremy L Thompson   if (dim != 3 || collograd) {
8301da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8311da99368SJeremy L Thompson   } else {
8321da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8331da99368SJeremy L Thompson   }
8341da99368SJeremy L Thompson 
835241a4b83SYohann   code << qFunction;
836241a4b83SYohann 
837241a4b83SYohann   // Setup
838*c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
839241a4b83SYohann   // Input Evecs and Restriction
840241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
841241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
842241a4b83SYohann     CeedChk(ierr);
8431da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
844241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
845241a4b83SYohann     }
846241a4b83SYohann   }
847241a4b83SYohann 
848241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
849241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
850241a4b83SYohann   }
8511da99368SJeremy L Thompson 
852241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
853241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
8541da99368SJeremy L Thompson 
855241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
856241a4b83SYohann   code << "BackendData data;\n";
857241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
858241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
859241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
860241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
861241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
862920dcdc4Sjeremylt 
863920dcdc4Sjeremylt   code << "\n// Input field constants and basis data\n";
864ac421f39SYohann   //Initialize constants, and matrices B and G
865241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
866920dcdc4Sjeremylt     code << "// -- Input field "<<i<<" --\n";
867241a4b83SYohann     // Get elemsize, emode, ncomp
868241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
869241a4b83SYohann     CeedChk(ierr);
870241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
871241a4b83SYohann     CeedChk(ierr);
872241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
873241a4b83SYohann     CeedChk(ierr);
8744d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
875241a4b83SYohann     CeedChk(ierr);
876920dcdc4Sjeremylt 
877920dcdc4Sjeremylt     // Set field constants
878920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
879920dcdc4Sjeremylt       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
880920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
881920dcdc4Sjeremylt         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
882920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
883920dcdc4Sjeremylt       } else {
884920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
885920dcdc4Sjeremylt       }
886920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
887920dcdc4Sjeremylt     }
888920dcdc4Sjeremylt 
889920dcdc4Sjeremylt     // Load basis data
890920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
891241a4b83SYohann     switch (emode) {
892241a4b83SYohann     case CEED_EVAL_NONE:
893241a4b83SYohann       break;
894241a4b83SYohann     case CEED_EVAL_INTERP:
895241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
896241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
897241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
898241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
899241a4b83SYohann       break;
900241a4b83SYohann     case CEED_EVAL_GRAD:
901241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
902241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
903241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
904241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
905ac421f39SYohann       if (basis_data->d_collograd1d) {
906ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
907ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
908ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
909ac421f39SYohann       } else {
910ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
911ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
912241a4b83SYohann         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
913ac421f39SYohann       }
914241a4b83SYohann       break;
915241a4b83SYohann     case CEED_EVAL_WEIGHT:
916241a4b83SYohann       break; // No action
917241a4b83SYohann     case CEED_EVAL_DIV:
918241a4b83SYohann       break; // TODO: Not implemented
919241a4b83SYohann     case CEED_EVAL_CURL:
920241a4b83SYohann       break; // TODO: Not implemented
921241a4b83SYohann     }
922241a4b83SYohann   }
923920dcdc4Sjeremylt 
924920dcdc4Sjeremylt   code << "\n// Output field constants and basis data\n";
925241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
926920dcdc4Sjeremylt     code << "// -- Output field "<<i<<" --\n";
927241a4b83SYohann     // Get elemsize, emode, ncomp
928241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
929241a4b83SYohann     CeedChk(ierr);
930241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
931241a4b83SYohann     CeedChk(ierr);
932241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
933241a4b83SYohann     CeedChk(ierr);
9344d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
935241a4b83SYohann     CeedChk(ierr);
936920dcdc4Sjeremylt 
937920dcdc4Sjeremylt     // Set field constants
938241a4b83SYohann     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
939920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
940241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
941241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
942920dcdc4Sjeremylt     } else {
943920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
944920dcdc4Sjeremylt     }
945920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
946920dcdc4Sjeremylt 
947920dcdc4Sjeremylt     // Load basis data
948920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
949920dcdc4Sjeremylt     switch (emode) {
950920dcdc4Sjeremylt     case CEED_EVAL_NONE:
951920dcdc4Sjeremylt       break; // No action
952920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
953241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
954241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
955241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
956241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
957241a4b83SYohann       break;
958241a4b83SYohann     case CEED_EVAL_GRAD:
959241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
960241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
961241a4b83SYohann       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
962241a4b83SYohann       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
963ac421f39SYohann       if (basis_data->d_collograd1d) {
964ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
965ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
966ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
967ac421f39SYohann       } else {
968ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
969ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
970241a4b83SYohann         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
971ac421f39SYohann       }
972241a4b83SYohann       break;
973241a4b83SYohann     case CEED_EVAL_WEIGHT: {
974241a4b83SYohann       Ceed ceed;
975241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
976241a4b83SYohann       return CeedError(ceed, 1,
977241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
978241a4b83SYohann       break; // Should not occur
979241a4b83SYohann     }
980241a4b83SYohann     case CEED_EVAL_DIV:
981241a4b83SYohann       break; // TODO: Not implemented
982241a4b83SYohann     case CEED_EVAL_CURL:
983241a4b83SYohann       break; // TODO: Not implemented
984241a4b83SYohann     }
985241a4b83SYohann   }
986ac421f39SYohann   code << "\n";
987ac421f39SYohann   code << "__syncthreads();\n";
988241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
989241a4b83SYohann   // Input basis apply if needed
990ac421f39SYohann   // Generate the correct eval mode code for each input
991920dcdc4Sjeremylt   code << "\n// Input field restrictions and basis actions\n";
992241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
993920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
994241a4b83SYohann     // Get elemsize, emode, ncomp
995241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
996241a4b83SYohann     CeedChk(ierr);
997241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
998241a4b83SYohann     CeedChk(ierr);
999241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1000241a4b83SYohann     CeedChk(ierr);
10014d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1002241a4b83SYohann     CeedChk(ierr);
1003920dcdc4Sjeremylt 
1004920dcdc4Sjeremylt     // Restriction
1005920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
1006920dcdc4Sjeremylt         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1007241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10089a2291e3SJeremy L Thompson 
10099a2291e3SJeremy L Thompson       bool isStrided;
1010fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
10119a2291e3SJeremy L Thompson       if (!isStrided) {
10125c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
10135c7b696cSjeremylt         CeedChk(ierr);
10145c7b696cSjeremylt         code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10155c7b696cSjeremylt         CeedInt compstride;
10165c7b696cSjeremylt         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
10175c7b696cSjeremylt         code << "  // CompStride: "<<compstride<<"\n";
10189a2291e3SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
10199a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10205c7b696cSjeremylt         code << "  readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1021ccedf6b0Sjeremylt       } else {
1022920dcdc4Sjeremylt         bool backendstrides;
1023fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1024920dcdc4Sjeremylt         CeedChk(ierr);
1025920dcdc4Sjeremylt         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1026920dcdc4Sjeremylt         if (!backendstrides) {
1027920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1028920dcdc4Sjeremylt           CeedChk(ierr);
1029ccedf6b0Sjeremylt         }
1030920dcdc4Sjeremylt         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1031d80fc06aSjeremylt         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1032920dcdc4Sjeremylt       }
1033920dcdc4Sjeremylt     }
1034920dcdc4Sjeremylt 
1035920dcdc4Sjeremylt     // Basis action
1036920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1037920dcdc4Sjeremylt     switch (emode) {
1038920dcdc4Sjeremylt     case CEED_EVAL_NONE:
1039920dcdc4Sjeremylt       if (!basis_data->d_collograd1d) {
1040920dcdc4Sjeremylt         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1041920dcdc4Sjeremylt       }
1042920dcdc4Sjeremylt       break;
1043920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1044241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1045241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1046241a4b83SYohann       break;
1047241a4b83SYohann     case CEED_EVAL_GRAD:
1048ac421f39SYohann       if (basis_data->d_collograd1d) {
1049ac421f39SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1050ac421f39SYohann         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1051ac421f39SYohann       } else {
1052241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1053241a4b83SYohann         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1054ac421f39SYohann       }
1055241a4b83SYohann       break;
1056241a4b83SYohann     case CEED_EVAL_WEIGHT:
1057241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1058241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1059241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1060241a4b83SYohann       data->W = basis_data->d_qweight1d;
1061241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1062241a4b83SYohann       break; // No action
1063241a4b83SYohann     case CEED_EVAL_DIV:
1064241a4b83SYohann       break; // TODO: Not implemented
1065241a4b83SYohann     case CEED_EVAL_CURL:
1066241a4b83SYohann       break; // TODO: Not implemented
1067241a4b83SYohann     }
1068241a4b83SYohann   }
1069ac421f39SYohann 
1070241a4b83SYohann   // Q function
1071920dcdc4Sjeremylt   code << "\n// Output field setup\n";
1072241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1073920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1074241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1075241a4b83SYohann     CeedChk(ierr);
1076241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1077241a4b83SYohann     {
1078ac421f39SYohann       if (basis_data->d_collograd1d) {
1079ac421f39SYohann         //Accumulator for gradient slices
1080ac421f39SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1081ac421f39SYohann         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1082ac421f39SYohann         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1083ac421f39SYohann         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1084ac421f39SYohann         code << "    }\n";
1085ac421f39SYohann         code << "  }\n";
1086ac421f39SYohann       } else {
1087241a4b83SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1088241a4b83SYohann       }
1089ac421f39SYohann     }
1090241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1091241a4b83SYohann     {
1092241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1093241a4b83SYohann     }
1094241a4b83SYohann   }
1095ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
1096ac421f39SYohann   if (basis_data->d_collograd1d) {
1097920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1098ac421f39SYohann     code << "#pragma unroll\n";
1099ac421f39SYohann     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1100ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1101920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1102ac421f39SYohann       // Get elemsize, emode, ncomp
1103ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1104ac421f39SYohann       CeedChk(ierr);
1105ac421f39SYohann       // Basis action
1106920dcdc4Sjeremylt       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1107ac421f39SYohann       switch (emode) {
1108ac421f39SYohann       case CEED_EVAL_NONE:
1109ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11109a2291e3SJeremy L Thompson 
11119a2291e3SJeremy L Thompson         bool isStrided;
1112fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
11139a2291e3SJeremy L Thompson         if (!isStrided) {
11145c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
11155c7b696cSjeremylt           CeedChk(ierr);
11165c7b696cSjeremylt           code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11175c7b696cSjeremylt           CeedInt compstride;
11185c7b696cSjeremylt           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
11195c7b696cSjeremylt           code << "  // CompStride: "<<compstride<<"\n";
11209a2291e3SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
11219a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11225c7b696cSjeremylt           code << "  readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1123920dcdc4Sjeremylt         } else {
1124920dcdc4Sjeremylt           bool backendstrides;
1125fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1126920dcdc4Sjeremylt           CeedChk(ierr);
1127920dcdc4Sjeremylt           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1128920dcdc4Sjeremylt           if (!backendstrides) {
1129920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1130920dcdc4Sjeremylt             CeedChk(ierr);
1131920dcdc4Sjeremylt           }
1132920dcdc4Sjeremylt           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1133d80fc06aSjeremylt           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1134920dcdc4Sjeremylt         }
1135ac421f39SYohann         break;
1136ac421f39SYohann       case CEED_EVAL_INTERP:
1137ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1138ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1139ac421f39SYohann         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1140ac421f39SYohann         code << "  }\n";
1141ac421f39SYohann         break;
1142ac421f39SYohann       case CEED_EVAL_GRAD:
1143ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1144ac421f39SYohann         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1145ac421f39SYohann         break;
1146ac421f39SYohann       case CEED_EVAL_WEIGHT:
1147ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[1];\n";
1148ac421f39SYohann         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1149ac421f39SYohann         break; // No action
1150ac421f39SYohann       case CEED_EVAL_DIV:
1151ac421f39SYohann         break; // TODO: Not implemented
1152ac421f39SYohann       case CEED_EVAL_CURL:
1153ac421f39SYohann         break; // TODO: Not implemented
1154ac421f39SYohann       }
1155ac421f39SYohann     }
1156ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1157920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1158ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1159ac421f39SYohann       CeedChk(ierr);
1160ac421f39SYohann       // Basis action
1161ac421f39SYohann       switch (emode) {
1162ac421f39SYohann       case CEED_EVAL_NONE:
1163ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1164ac421f39SYohann         break; // No action
1165ac421f39SYohann       case CEED_EVAL_INTERP:
1166ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1167ac421f39SYohann         break;
1168ac421f39SYohann       case CEED_EVAL_GRAD:
1169ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1170ac421f39SYohann         break;
1171ac421f39SYohann       case CEED_EVAL_WEIGHT:
1172ac421f39SYohann         break; // Should not occur
1173ac421f39SYohann       case CEED_EVAL_DIV:
1174ac421f39SYohann         break; // TODO: Not implemented
1175ac421f39SYohann       case CEED_EVAL_CURL:
1176ac421f39SYohann         break; // TODO: Not implemented
1177ac421f39SYohann       }
1178ac421f39SYohann     }
1179ac421f39SYohann   } else {
1180920dcdc4Sjeremylt       code << "\n  // Note: No Collocated Gradient\n";
1181ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1182920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1183ac421f39SYohann       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1184ac421f39SYohann     }
1185ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1186920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1187ac421f39SYohann       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1188ac421f39SYohann     }
1189ac421f39SYohann   }
1190920dcdc4Sjeremylt   code << "  // QFunction Inputs and outputs\n";
11914d537eeaSYohann   code << "  CeedScalar* in["<<numinputfields<<"];\n";
11924d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1193920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
1194ac421f39SYohann     code << "  in["<<i<<"] = r_q"<<i<<";\n";
11954d537eeaSYohann   }
11964d537eeaSYohann   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
11974d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1198920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1199ac421f39SYohann     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
12004d537eeaSYohann   }
1201920dcdc4Sjeremylt   code << "\n  // Apply QFunction\n";
1202ac421f39SYohann   code << "  "<<qFunctionName<<"(ctx, ";
1203ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1204ac421f39SYohann     code << "1 ";
1205ac421f39SYohann   } else {
1206ac421f39SYohann     code << "Q1d ";
1207ac421f39SYohann   }
1208ac421f39SYohann   code << ", in, out);\n";
1209ac421f39SYohann   if (basis_data->d_collograd1d) {
1210920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1211ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1212920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1213ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1214ac421f39SYohann       CeedChk(ierr);
1215ac421f39SYohann       // Basis action
1216920dcdc4Sjeremylt       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1217ac421f39SYohann       switch (emode) {
1218ac421f39SYohann       case CEED_EVAL_NONE:
1219ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1220ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1221ac421f39SYohann         code << "  }\n";
1222ac421f39SYohann         break; // No action
1223ac421f39SYohann       case CEED_EVAL_INTERP:
1224ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1225ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1226ac421f39SYohann         code << "  }\n";
1227ac421f39SYohann         break;
1228ac421f39SYohann       case CEED_EVAL_GRAD:
1229ac421f39SYohann         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1230ac421f39SYohann         break;
1231ac421f39SYohann       case CEED_EVAL_WEIGHT:
1232ac421f39SYohann         break; // Should not occur
1233ac421f39SYohann       case CEED_EVAL_DIV:
1234ac421f39SYohann         break; // TODO: Not implemented
1235ac421f39SYohann       case CEED_EVAL_CURL:
1236ac421f39SYohann         break; // TODO: Not implemented
1237ac421f39SYohann       }
1238ac421f39SYohann     }
1239ac421f39SYohann     code << "}\n";
1240ac421f39SYohann   }
1241241a4b83SYohann 
1242241a4b83SYohann   // Output basis apply if needed
1243ac421f39SYohann   // Generate the correct eval mode code for each output
1244920dcdc4Sjeremylt   code << "\n// Output field basis action and restrictions\n";
1245241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1246920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1247241a4b83SYohann     // Get elemsize, emode, ncomp
1248241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1249241a4b83SYohann     CeedChk(ierr);
1250241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1251241a4b83SYohann     CeedChk(ierr);
1252241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1253241a4b83SYohann     CeedChk(ierr);
12544d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1255241a4b83SYohann     CeedChk(ierr);
1256241a4b83SYohann     // Basis action
1257920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1258241a4b83SYohann     switch (emode) {
1259241a4b83SYohann     case CEED_EVAL_NONE:
1260920dcdc4Sjeremylt       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1261241a4b83SYohann       break; // No action
1262241a4b83SYohann     case CEED_EVAL_INTERP:
1263241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1264241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1265241a4b83SYohann       break;
1266241a4b83SYohann     case CEED_EVAL_GRAD:
1267241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1268ac421f39SYohann       if (basis_data->d_collograd1d) {
1269ac421f39SYohann         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1270ac421f39SYohann       } else {
1271241a4b83SYohann         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1272ac421f39SYohann       }
1273241a4b83SYohann       break;
1274241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1275241a4b83SYohann       Ceed ceed;
1276241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1277241a4b83SYohann       return CeedError(ceed, 1,
1278241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1279241a4b83SYohann       break; // Should not occur
1280241a4b83SYohann     }
1281241a4b83SYohann     case CEED_EVAL_DIV:
1282241a4b83SYohann       break; // TODO: Not implemented
1283241a4b83SYohann     case CEED_EVAL_CURL:
1284241a4b83SYohann       break; // TODO: Not implemented
1285241a4b83SYohann     }
1286920dcdc4Sjeremylt     // Restriction
12879a2291e3SJeremy L Thompson       bool isStrided;
1288fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
12899a2291e3SJeremy L Thompson     if (!isStrided) {
12905c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
12915c7b696cSjeremylt       CeedChk(ierr);
12925c7b696cSjeremylt       code << "  const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
12935c7b696cSjeremylt       CeedInt compstride;
12945c7b696cSjeremylt       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
12955c7b696cSjeremylt       code << "  // CompStride: "<<compstride<<"\n";
12969a2291e3SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
12979a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
12985c7b696cSjeremylt       code << "  writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1299920dcdc4Sjeremylt     } else {
1300920dcdc4Sjeremylt       bool backendstrides;
1301fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1302920dcdc4Sjeremylt       CeedChk(ierr);
1303920dcdc4Sjeremylt       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1304920dcdc4Sjeremylt       if (!backendstrides) {
1305920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1306920dcdc4Sjeremylt         CeedChk(ierr);
1307920dcdc4Sjeremylt       }
1308920dcdc4Sjeremylt       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1309d80fc06aSjeremylt       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1310920dcdc4Sjeremylt     }
1311241a4b83SYohann   }
1312241a4b83SYohann 
1313241a4b83SYohann   code << "  }\n";
1314241a4b83SYohann   code << "}\n\n";
1315241a4b83SYohann 
1316ab213215SJeremy L Thompson   // View kernel for debugging
1317241a4b83SYohann   // std::cout << code.str();
1318241a4b83SYohann 
1319920dcdc4Sjeremylt   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1320920dcdc4Sjeremylt   CeedChk(ierr);
1321*c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1322241a4b83SYohann   CeedChk(ierr);
1323241a4b83SYohann 
1324241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1325241a4b83SYohann   return 0;
1326241a4b83SYohann }
1327ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1328