xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision e15f9bd09af0280c89b79924fa9af7dd2e3e30be)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
187df94212SJeremy L Thompson 
193d576824SJeremy L Thompson #include <ceed.h>
203d576824SJeremy L Thompson #include <ceed-backend.h>
213d576824SJeremy L Thompson #include <cuda_runtime.h>
22241a4b83SYohann #include <iostream>
23241a4b83SYohann #include <sstream>
243d576824SJeremy L Thompson #include "ceed-cuda-gen.h"
25241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
26241a4b83SYohann 
27f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
28ab213215SJeremy L Thompson //------------------------------------------------------------------------------
29ab213215SJeremy L Thompson // Atomic add, for older CUDA
30ab213215SJeremy L Thompson //------------------------------------------------------------------------------
31241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
32241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
33241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
34241a4b83SYohann   do {
35241a4b83SYohann     assumed = old;
36241a4b83SYohann     old =
37241a4b83SYohann       atomicCAS(address_as_ull, assumed,
38241a4b83SYohann                 __double_as_longlong(val +
39241a4b83SYohann                                      __longlong_as_double(assumed)));
40241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
41241a4b83SYohann     // (since NaN != NaN)
42241a4b83SYohann   } while (assumed != old);
43241a4b83SYohann   return __longlong_as_double(old);
44241a4b83SYohann }
45f1a13f77SYohann Dudouit );
46f1a13f77SYohann Dudouit 
47f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
48f1a13f77SYohann Dudouit 
49ab213215SJeremy L Thompson //------------------------------------------------------------------------------
50ab213215SJeremy L Thompson // Typedefs
51ab213215SJeremy L Thompson //------------------------------------------------------------------------------
52f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
53f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
54f1a13f77SYohann Dudouit 
55f1a13f77SYohann Dudouit typedef struct {
56f1a13f77SYohann Dudouit   CeedInt tidx;
57f1a13f77SYohann Dudouit   CeedInt tidy;
58f1a13f77SYohann Dudouit   CeedInt tidz;
59f1a13f77SYohann Dudouit   CeedInt tid;
60f1a13f77SYohann Dudouit   CeedScalar* slice;
61f1a13f77SYohann Dudouit } BackendData;
62241a4b83SYohann 
63ab213215SJeremy L Thompson //------------------------------------------------------------------------------
64ab213215SJeremy L Thompson // Load matrices for basis actions
65ab213215SJeremy L Thompson //------------------------------------------------------------------------------
66241a4b83SYohann template <int P, int Q>
67241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
68ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
69241a4b83SYohann     B[i] = d_B[i];
70241a4b83SYohann }
71241a4b83SYohann 
72ab213215SJeremy L Thompson //------------------------------------------------------------------------------
73241a4b83SYohann // 1D
74ab213215SJeremy L Thompson //------------------------------------------------------------------------------
75ab213215SJeremy L Thompson 
76ab213215SJeremy L Thompson //------------------------------------------------------------------------------
77ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
78ab213215SJeremy L Thompson //------------------------------------------------------------------------------
795c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
805c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
81ab213215SJeremy L Thompson   if (data.tidx < P1d) {
824d537eeaSYohann     const CeedInt node = data.tidx;
83ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
84ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
855c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
86241a4b83SYohann   }
87241a4b83SYohann }
88920dcdc4Sjeremylt 
89ab213215SJeremy L Thompson //------------------------------------------------------------------------------
90ab213215SJeremy L Thompson // L-vector -> E-vector, strided
91ab213215SJeremy L Thompson //------------------------------------------------------------------------------
92d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
93d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
94ab213215SJeremy L Thompson   if (data.tidx < P1d) {
95ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
96d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
97ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
98d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
99ccedf6b0Sjeremylt   }
100ccedf6b0Sjeremylt }
101241a4b83SYohann 
102ab213215SJeremy L Thompson //------------------------------------------------------------------------------
103ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
104ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1055c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1065c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
107ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1084d537eeaSYohann     const CeedInt node = data.tidx;
109ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
110ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1115c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
112241a4b83SYohann   }
113241a4b83SYohann }
114241a4b83SYohann 
115ab213215SJeremy L Thompson //------------------------------------------------------------------------------
116ab213215SJeremy L Thompson // E-vector -> L-vector, strided
117ab213215SJeremy L Thompson //------------------------------------------------------------------------------
118d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
119d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
120ab213215SJeremy L Thompson   if (data.tidx < P1d) {
121ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
122d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
123ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
124d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
125ccedf6b0Sjeremylt   }
126ccedf6b0Sjeremylt }
127ccedf6b0Sjeremylt 
128ab213215SJeremy L Thompson //------------------------------------------------------------------------------
129ab213215SJeremy L Thompson // 1D tensor contraction x
130ab213215SJeremy L Thompson //------------------------------------------------------------------------------
131241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
132ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
133241a4b83SYohann   data.slice[data.tidx] = *U;
134241a4b83SYohann   __syncthreads();
135241a4b83SYohann   *V = 0.0;
13618d499f1SYohann   if (data.tidx < Q1d)
137ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
138ab213215SJeremy L Thompson       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
139241a4b83SYohann   __syncthreads();
140241a4b83SYohann }
141241a4b83SYohann 
142ab213215SJeremy L Thompson //------------------------------------------------------------------------------
143ab213215SJeremy L Thompson // 1D transpose tensor contraction x
144ab213215SJeremy L Thompson //------------------------------------------------------------------------------
145241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
146ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
147241a4b83SYohann   data.slice[data.tidx] = *U;
148241a4b83SYohann   __syncthreads();
149241a4b83SYohann   *V = 0.0;
15018d499f1SYohann   if (data.tidx < P1d)
151ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
152ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
153241a4b83SYohann   __syncthreads();
154241a4b83SYohann }
155241a4b83SYohann 
156ab213215SJeremy L Thompson //------------------------------------------------------------------------------
157ab213215SJeremy L Thompson // 1D interpolate to quadrature points
158ab213215SJeremy L Thompson //------------------------------------------------------------------------------
159241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
160ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
161ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
162241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
163241a4b83SYohann }
164241a4b83SYohann 
165ab213215SJeremy L Thompson //------------------------------------------------------------------------------
166ab213215SJeremy L Thompson // 1D interpolate transpose
167ab213215SJeremy L Thompson //------------------------------------------------------------------------------
168241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
169ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
170ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
171241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
172241a4b83SYohann }
173241a4b83SYohann 
174ab213215SJeremy L Thompson //------------------------------------------------------------------------------
175ab213215SJeremy L Thompson // 1D derivatives at quadrature points
176ab213215SJeremy L Thompson //------------------------------------------------------------------------------
177241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
178ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
179ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
180241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
181241a4b83SYohann }
182241a4b83SYohann 
183ab213215SJeremy L Thompson //------------------------------------------------------------------------------
184ab213215SJeremy L Thompson // 1D derivatives transpose
185ab213215SJeremy L Thompson //------------------------------------------------------------------------------
186241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
187ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
189241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
190241a4b83SYohann }
191241a4b83SYohann 
192ab213215SJeremy L Thompson //------------------------------------------------------------------------------
193241a4b83SYohann // 2D
194ab213215SJeremy L Thompson //------------------------------------------------------------------------------
195ab213215SJeremy L Thompson 
196ab213215SJeremy L Thompson //------------------------------------------------------------------------------
197ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
198ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1995c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
2005c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
201ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2024d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
203ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
204ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2055c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
206241a4b83SYohann   }
207241a4b83SYohann }
208920dcdc4Sjeremylt 
209ab213215SJeremy L Thompson //------------------------------------------------------------------------------
210ab213215SJeremy L Thompson // L-vector -> E-vector, strided
211ab213215SJeremy L Thompson //------------------------------------------------------------------------------
212d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
213d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
214ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
215ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
216d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
217ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
218d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
219ccedf6b0Sjeremylt   }
220ccedf6b0Sjeremylt }
221241a4b83SYohann 
222ab213215SJeremy L Thompson //------------------------------------------------------------------------------
223ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
224ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2255c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
2265c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
227ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2284d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
229ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
230ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2315c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
232241a4b83SYohann   }
233241a4b83SYohann }
234241a4b83SYohann 
235ab213215SJeremy L Thompson //------------------------------------------------------------------------------
236ab213215SJeremy L Thompson // E-vector -> L-vector, strided
237ab213215SJeremy L Thompson //------------------------------------------------------------------------------
238d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
239d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
240ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
241ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
242d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
243ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
244d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
245ccedf6b0Sjeremylt   }
246ccedf6b0Sjeremylt }
247ccedf6b0Sjeremylt 
248ab213215SJeremy L Thompson //------------------------------------------------------------------------------
249ab213215SJeremy L Thompson // 2D tensor contraction x
250ab213215SJeremy L Thompson //------------------------------------------------------------------------------
251241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
252ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
25318d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
254241a4b83SYohann   __syncthreads();
255241a4b83SYohann   *V = 0.0;
25618d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
257ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
25818d499f1SYohann       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
259241a4b83SYohann   __syncthreads();
260241a4b83SYohann }
261241a4b83SYohann 
262ab213215SJeremy L Thompson //------------------------------------------------------------------------------
263ab213215SJeremy L Thompson // 2D tensor contract y
264ab213215SJeremy L Thompson //------------------------------------------------------------------------------
265241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
266ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
26718d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
268241a4b83SYohann   __syncthreads();
269241a4b83SYohann   *V = 0.0;
27018d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d)
271ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
27218d499f1SYohann       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
273241a4b83SYohann   __syncthreads();
274241a4b83SYohann }
275241a4b83SYohann 
276ab213215SJeremy L Thompson //------------------------------------------------------------------------------
277ab213215SJeremy L Thompson // 2D transpose tensor contract y
278ab213215SJeremy L Thompson //------------------------------------------------------------------------------
279241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
280ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
28118d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
282241a4b83SYohann   __syncthreads();
283241a4b83SYohann   *V = 0.0;
28418d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
285ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
28618d499f1SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
287241a4b83SYohann   __syncthreads();
288241a4b83SYohann }
289241a4b83SYohann 
290ab213215SJeremy L Thompson //------------------------------------------------------------------------------
291ab213215SJeremy L Thompson // 2D transpose tensor contract x
292ab213215SJeremy L Thompson //------------------------------------------------------------------------------
293241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
294ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
29518d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
296241a4b83SYohann   __syncthreads();
297241a4b83SYohann   *V = 0.0;
29818d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
299ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
30018d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
301241a4b83SYohann   __syncthreads();
302241a4b83SYohann }
303241a4b83SYohann 
304ab213215SJeremy L Thompson //------------------------------------------------------------------------------
305ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
306ab213215SJeremy L Thompson //------------------------------------------------------------------------------
307241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
308ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
30918d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
310241a4b83SYohann   __syncthreads();
31118d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
312ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
31318d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
314241a4b83SYohann   __syncthreads();
315241a4b83SYohann }
316241a4b83SYohann 
317ab213215SJeremy L Thompson //------------------------------------------------------------------------------
318ab213215SJeremy L Thompson // 2D interpolate to quadrature points
319ab213215SJeremy L Thompson //------------------------------------------------------------------------------
320241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
321ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
322241a4b83SYohann   CeedScalar r_t[1];
323ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
324241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
325241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
326241a4b83SYohann   }
327241a4b83SYohann }
328241a4b83SYohann 
329ab213215SJeremy L Thompson //------------------------------------------------------------------------------
330ab213215SJeremy L Thompson // 2D interpolate transpose
331ab213215SJeremy L Thompson //------------------------------------------------------------------------------
332241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
333ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
334241a4b83SYohann   CeedScalar r_t[1];
335ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
336241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
337241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
338241a4b83SYohann   }
339241a4b83SYohann }
340241a4b83SYohann 
341ab213215SJeremy L Thompson //------------------------------------------------------------------------------
342ab213215SJeremy L Thompson // 2D derivatives at quadrature points
343ab213215SJeremy L Thompson //------------------------------------------------------------------------------
344241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
345ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
346241a4b83SYohann   CeedScalar r_t[1];
347ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
348241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
349241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
350241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
351241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
352241a4b83SYohann   }
353241a4b83SYohann }
354241a4b83SYohann 
355ab213215SJeremy L Thompson //------------------------------------------------------------------------------
356ab213215SJeremy L Thompson // 2D derivatives transpose
357ab213215SJeremy L Thompson //------------------------------------------------------------------------------
358241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
359ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
360241a4b83SYohann   CeedScalar r_t[1];
361ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
362241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
363241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
364241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
365241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
366241a4b83SYohann   }
367241a4b83SYohann }
368241a4b83SYohann 
369ab213215SJeremy L Thompson //------------------------------------------------------------------------------
370241a4b83SYohann // 3D
371ab213215SJeremy L Thompson //------------------------------------------------------------------------------
372ab213215SJeremy L Thompson 
373ab213215SJeremy L Thompson //------------------------------------------------------------------------------
374ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
375ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
3775c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
378ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
379241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3804d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
381ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
382ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3835c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
384241a4b83SYohann     }
385241a4b83SYohann }
386920dcdc4Sjeremylt 
387ab213215SJeremy L Thompson //------------------------------------------------------------------------------
388ab213215SJeremy L Thompson // L-vector -> E-vector, strided
389ab213215SJeremy L Thompson //------------------------------------------------------------------------------
390d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
391d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
392ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
393ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
394ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
395d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
396ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
397d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
398ccedf6b0Sjeremylt     }
399ccedf6b0Sjeremylt }
400241a4b83SYohann 
401ab213215SJeremy L Thompson //------------------------------------------------------------------------------
402ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
403ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4045c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
4055c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
40618d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
407ac421f39SYohann     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
408920dcdc4Sjeremylt     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
409ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4105c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
411ac421f39SYohann   }
41218d499f1SYohann }
413ac421f39SYohann 
414ab213215SJeremy L Thompson //------------------------------------------------------------------------------
415ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
416ab213215SJeremy L Thompson //------------------------------------------------------------------------------
417d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
41825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
41918d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
420920dcdc4Sjeremylt     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
421d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
422ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
423d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
424920dcdc4Sjeremylt   }
42518d499f1SYohann }
426920dcdc4Sjeremylt 
427ab213215SJeremy L Thompson //------------------------------------------------------------------------------
428ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
429ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4305c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
4315c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
432ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
433241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4344d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
435ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
436ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4375c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
438241a4b83SYohann     }
439241a4b83SYohann }
440241a4b83SYohann 
441ab213215SJeremy L Thompson //------------------------------------------------------------------------------
442ab213215SJeremy L Thompson // E-vector -> L-vector, strided
443ab213215SJeremy L Thompson //------------------------------------------------------------------------------
444d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4452f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
446ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
447ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
448ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
449d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
450ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
451d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
452ccedf6b0Sjeremylt     }
453ccedf6b0Sjeremylt }
454ccedf6b0Sjeremylt 
455ab213215SJeremy L Thompson //------------------------------------------------------------------------------
456ab213215SJeremy L Thompson // 3D tensor contract x
457ab213215SJeremy L Thompson //------------------------------------------------------------------------------
458241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
459ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
460ac421f39SYohann   CeedScalar r_B[P1d];
461ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
462ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
463ab213215SJeremy L Thompson 
464ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
46518d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
466241a4b83SYohann     __syncthreads();
467241a4b83SYohann     V[k] = 0.0;
46818d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
469ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
47018d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
471241a4b83SYohann     __syncthreads();
472241a4b83SYohann   }
473241a4b83SYohann }
474241a4b83SYohann 
475ab213215SJeremy L Thompson //------------------------------------------------------------------------------
476ab213215SJeremy L Thompson // 3D tensor contract y
477ab213215SJeremy L Thompson //------------------------------------------------------------------------------
478241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
479ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
480ac421f39SYohann   CeedScalar r_B[P1d];
481ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
482ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
483ab213215SJeremy L Thompson 
484ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
48518d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
486241a4b83SYohann     __syncthreads();
487241a4b83SYohann     V[k] = 0.0;
48818d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
489ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
49018d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
491241a4b83SYohann     __syncthreads();
492241a4b83SYohann   }
493241a4b83SYohann }
494241a4b83SYohann 
495ab213215SJeremy L Thompson //------------------------------------------------------------------------------
496ab213215SJeremy L Thompson // 3D tensor contract z
497ab213215SJeremy L Thompson //------------------------------------------------------------------------------
498241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
499ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
500ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
501241a4b83SYohann     V[k] = 0.0;
50218d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
503ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
504ab213215SJeremy L Thompson         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
505241a4b83SYohann   }
506241a4b83SYohann }
507241a4b83SYohann 
508ab213215SJeremy L Thompson //------------------------------------------------------------------------------
509ab213215SJeremy L Thompson // 3D transpose tensor contract z
510ab213215SJeremy L Thompson //------------------------------------------------------------------------------
511241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
512ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
51318d499f1SYohann   for (CeedInt k = 0; k < P1d; ++k) {
514241a4b83SYohann     V[k] = 0.0;
51518d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
516ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
517ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
518241a4b83SYohann   }
519241a4b83SYohann }
520241a4b83SYohann 
521ab213215SJeremy L Thompson //------------------------------------------------------------------------------
522ab213215SJeremy L Thompson // 3D transpose tensor contract y
523ab213215SJeremy L Thompson //------------------------------------------------------------------------------
524241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
525ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
526ac421f39SYohann   CeedScalar r_B[Q1d];
527ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
528ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
529ab213215SJeremy L Thompson 
530ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
53118d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
532241a4b83SYohann     __syncthreads();
533241a4b83SYohann     V[k] = 0.0;
53418d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
535ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
53618d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
537ac421f39SYohann     __syncthreads();
538ac421f39SYohann   }
539ac421f39SYohann }
540ac421f39SYohann 
541ab213215SJeremy L Thompson //------------------------------------------------------------------------------
542ab213215SJeremy L Thompson // 3D transpose tensor contract add y
543ab213215SJeremy L Thompson //------------------------------------------------------------------------------
544ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
545ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
546ac421f39SYohann   CeedScalar r_B[Q1d];
547ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
548ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
549ab213215SJeremy L Thompson 
550ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
55118d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
552ac421f39SYohann     __syncthreads();
55318d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
554ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
55518d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
556241a4b83SYohann     __syncthreads();
557241a4b83SYohann   }
558241a4b83SYohann }
559241a4b83SYohann 
560ab213215SJeremy L Thompson //------------------------------------------------------------------------------
561ab213215SJeremy L Thompson // 3D transpose tensor contract x
562ab213215SJeremy L Thompson //------------------------------------------------------------------------------
563241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
564ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
565ac421f39SYohann   CeedScalar r_B[Q1d];
566ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
567ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
568ab213215SJeremy L Thompson 
569ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
57018d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
571241a4b83SYohann     __syncthreads();
572241a4b83SYohann     V[k] = 0.0;
57318d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
574ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
57518d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
576241a4b83SYohann     __syncthreads();
577241a4b83SYohann   }
578241a4b83SYohann }
579241a4b83SYohann 
580ab213215SJeremy L Thompson //------------------------------------------------------------------------------
581ab213215SJeremy L Thompson // 3D transpose tensor contract add x
582ab213215SJeremy L Thompson //------------------------------------------------------------------------------
583241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
584ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
585ac421f39SYohann   CeedScalar r_B[Q1d];
586ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
587ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
588ab213215SJeremy L Thompson 
589ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
59018d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
591241a4b83SYohann     __syncthreads();
59218d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
593ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
59418d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
595241a4b83SYohann     __syncthreads();
596241a4b83SYohann   }
597241a4b83SYohann }
598241a4b83SYohann 
599ab213215SJeremy L Thompson //------------------------------------------------------------------------------
600ab213215SJeremy L Thompson // 3D interpolate to quadrature points
601ab213215SJeremy L Thompson //------------------------------------------------------------------------------
602241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
603ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
60418d499f1SYohann   CeedScalar r_t1[T1d];
60518d499f1SYohann   CeedScalar r_t2[T1d];
606ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
607241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
608241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
609241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
610241a4b83SYohann   }
611241a4b83SYohann }
612241a4b83SYohann 
613ab213215SJeremy L Thompson //------------------------------------------------------------------------------
614ab213215SJeremy L Thompson // 3D interpolate transpose
615ab213215SJeremy L Thompson //------------------------------------------------------------------------------
616241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
617ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
61818d499f1SYohann   CeedScalar r_t1[T1d];
61918d499f1SYohann   CeedScalar r_t2[T1d];
620ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
621241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
622241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
623241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
624241a4b83SYohann   }
625241a4b83SYohann }
626241a4b83SYohann 
627ab213215SJeremy L Thompson //------------------------------------------------------------------------------
628ab213215SJeremy L Thompson // 3D derivatives at quadrature points
629ab213215SJeremy L Thompson //------------------------------------------------------------------------------
630241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
631ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
63218d499f1SYohann   CeedScalar r_t1[T1d];
63318d499f1SYohann   CeedScalar r_t2[T1d];
634ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
635241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
636241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
637241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
638241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
639241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
640241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
641241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
642241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
643241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
644241a4b83SYohann   }
645241a4b83SYohann }
646241a4b83SYohann 
647ab213215SJeremy L Thompson //------------------------------------------------------------------------------
648ab213215SJeremy L Thompson // 3D derivatives transpose
649ab213215SJeremy L Thompson //------------------------------------------------------------------------------
650241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
651ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
65218d499f1SYohann   CeedScalar r_t1[T1d];
65318d499f1SYohann   CeedScalar r_t2[T1d];
654ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
655241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
656241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
657241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
658241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
659241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
660241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
661241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
662241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
663241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
664241a4b83SYohann   }
665241a4b83SYohann }
666241a4b83SYohann 
667ab213215SJeremy L Thompson //------------------------------------------------------------------------------
668ab213215SJeremy L Thompson // 3D collocated derivatives computation
669ab213215SJeremy L Thompson //------------------------------------------------------------------------------
670ac421f39SYohann template <int NCOMP, int Q1d>
671ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
67218d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
673ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
67418d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
675ac421f39SYohann       __syncthreads();
676ac421f39SYohann       // X derivative
677ac421f39SYohann       r_V[comp+0*NCOMP] = 0.0;
678ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
67918d499f1SYohann         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
680ac421f39SYohann       // Y derivative
681ac421f39SYohann       r_V[comp+1*NCOMP] = 0.0;
682ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
68318d499f1SYohann         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
684ac421f39SYohann       // Z derivative
685ac421f39SYohann       r_V[comp+2*NCOMP] = 0.0;
686ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
687ab213215SJeremy L Thompson         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
688ac421f39SYohann       __syncthreads();
689ac421f39SYohann     }
690ac421f39SYohann   }
69118d499f1SYohann }
692ac421f39SYohann 
693ab213215SJeremy L Thompson //------------------------------------------------------------------------------
694ab213215SJeremy L Thompson // 3D collocated derivatives transpose
695ab213215SJeremy L Thompson //------------------------------------------------------------------------------
696ac421f39SYohann template <int NCOMP, int Q1d>
697ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
69818d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
699ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
700ac421f39SYohann       // X derivative
70118d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
702ac421f39SYohann       __syncthreads();
703ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
70418d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
705ac421f39SYohann       __syncthreads();
706ac421f39SYohann       // Y derivative
70718d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
708ac421f39SYohann       __syncthreads();
709ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
71018d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
711ac421f39SYohann       __syncthreads();
712ac421f39SYohann       // Z derivative
713ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
714ac421f39SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
715ac421f39SYohann     }
716ac421f39SYohann   }
71718d499f1SYohann }
718ac421f39SYohann 
719ab213215SJeremy L Thompson //------------------------------------------------------------------------------
720ab213215SJeremy L Thompson // 1D quadrature weights
721ab213215SJeremy L Thompson //------------------------------------------------------------------------------
722241a4b83SYohann template <int Q1d>
723241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
72418d499f1SYohann   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
725241a4b83SYohann }
726241a4b83SYohann 
727ab213215SJeremy L Thompson //------------------------------------------------------------------------------
728ab213215SJeremy L Thompson // 2D quadrature weights
729ab213215SJeremy L Thompson //------------------------------------------------------------------------------
730241a4b83SYohann template <int Q1d>
731241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
73218d499f1SYohann   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
73318d499f1SYohann         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
734241a4b83SYohann }
735241a4b83SYohann 
736ab213215SJeremy L Thompson //------------------------------------------------------------------------------
737ab213215SJeremy L Thompson // 3D quadrature weights
738ab213215SJeremy L Thompson //------------------------------------------------------------------------------
739241a4b83SYohann template <int Q1d>
740241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
74118d499f1SYohann   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
74218d499f1SYohann   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
743ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
74418d499f1SYohann     w[z] = quad ? pw*qweight1d[z] : 0.0;
745241a4b83SYohann }
746241a4b83SYohann 
747241a4b83SYohann );
748ab213215SJeremy L Thompson //------------------------------------------------------------------------------
749ab213215SJeremy L Thompson // Build singe operator kernel
750ab213215SJeremy L Thompson //------------------------------------------------------------------------------
751241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
752241a4b83SYohann 
753241a4b83SYohann   using std::ostringstream;
754241a4b83SYohann   using std::string;
755241a4b83SYohann   int ierr;
756241a4b83SYohann   bool setupdone;
757*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
758*e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
759241a4b83SYohann   Ceed ceed;
760*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
761241a4b83SYohann   CeedOperator_Cuda_gen *data;
762*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
763241a4b83SYohann   CeedQFunction qf;
764241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
765*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
766*e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
7671da99368SJeremy L Thompson   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
7685c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
769*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
770*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
771241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
772*e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
773241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
774241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
775*e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
776241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
777241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
778*e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
779241a4b83SYohann   CeedEvalMode emode;
780241a4b83SYohann   CeedBasis basis;
781241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
782241a4b83SYohann   CeedElemRestriction Erestrict;
783461525f5SNatalie Beams   CeedElemRestriction_Cuda *restr_data;
784241a4b83SYohann 
785241a4b83SYohann   ostringstream code;
786241a4b83SYohann   string devFunctions(deviceFunctions);
787241a4b83SYohann 
788f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
789f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
790f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
791*e15f9bd0SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr);
792f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
793f1a13f77SYohann Dudouit   if (prop.major<6){
794f1a13f77SYohann Dudouit     code << atomicAdd;
795f1a13f77SYohann Dudouit   }
796f1a13f77SYohann Dudouit 
797241a4b83SYohann   code << devFunctions;
798241a4b83SYohann 
799241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
800c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
801c3c443faSYohann Dudouit   string oper;
80214cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
8034d537eeaSYohann 
8044d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
805*e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
806*e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8071da99368SJeremy L Thompson 
8081da99368SJeremy L Thompson   // Find dim and Q1d
80964d3f0c0Sjeremylt   bool useCollograd = true;
81018d499f1SYohann   data->maxP1d = 0;
8111da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
812*e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8131da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
814*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8150f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
816*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8171da99368SJeremy L Thompson 
8181da99368SJeremy L Thompson       // Check for collocated gradient
81918d499f1SYohann       useCollograd = useCollograd && basis_data->d_collograd1d;
8201da99368SJeremy L Thompson 
8211da99368SJeremy L Thompson       // Collect dim and Q1d
822*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8231da99368SJeremy L Thompson       bool isTensor;
824*e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8251da99368SJeremy L Thompson       if (isTensor) {
826*e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
827*e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
82818d499f1SYohann         if (P1d>data->maxP1d) data->maxP1d = P1d;
8291da99368SJeremy L Thompson       } else {
830e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
831*e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
832e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8331da99368SJeremy L Thompson         }
8341da99368SJeremy L Thompson     }
8351da99368SJeremy L Thompson   }
8361da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8371da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8381da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
839*e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8400f54b25eSjeremylt 
8411da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
842*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8430f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
844*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8450f54b25eSjeremylt 
8461da99368SJeremy L Thompson       // Collect dim and Q1d
847*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8481da99368SJeremy L Thompson       bool isTensor;
849*e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8501da99368SJeremy L Thompson       if (isTensor) {
851*e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8521da99368SJeremy L Thompson       } else {
853e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
854*e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
855e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8561da99368SJeremy L Thompson         }
8570f54b25eSjeremylt 
8580f54b25eSjeremylt       // Check for collocated gradient
85964d3f0c0Sjeremylt       useCollograd = useCollograd && basis_data->d_collograd1d;
8601da99368SJeremy L Thompson     }
8611da99368SJeremy L Thompson   }
8621da99368SJeremy L Thompson   data->dim = dim;
8631da99368SJeremy L Thompson   data->Q1d = Q1d;
8641da99368SJeremy L Thompson 
8651da99368SJeremy L Thompson   // Define CEED_Q_VLA
86664d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
8671da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8681da99368SJeremy L Thompson   } else {
8691da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8701da99368SJeremy L Thompson   }
8711da99368SJeremy L Thompson 
872241a4b83SYohann   code << qFunction;
873241a4b83SYohann 
874241a4b83SYohann   // Setup
875818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
876c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
877241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
878241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
879*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8801da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
881241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
882241a4b83SYohann     }
883241a4b83SYohann   }
884241a4b83SYohann 
885241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
886241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
887241a4b83SYohann   }
8881da99368SJeremy L Thompson 
889241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
890241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8911da99368SJeremy L Thompson 
892241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
893241a4b83SYohann   code << "  BackendData data;\n";
894241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
895241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
896241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
897241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
89818d499f1SYohann   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
899920dcdc4Sjeremylt 
900818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
901ac421f39SYohann   //Initialize constants, and matrices B and G
902241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
903818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
904241a4b83SYohann     // Get elemsize, emode, ncomp
905241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
906*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
907241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
908*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
909241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
910*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9114d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
912*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
913920dcdc4Sjeremylt 
914920dcdc4Sjeremylt     // Set field constants
915920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
916*e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
917920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
918*e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
919920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
920920dcdc4Sjeremylt       } else {
921920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
922920dcdc4Sjeremylt       }
923920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
924920dcdc4Sjeremylt     }
925920dcdc4Sjeremylt 
926920dcdc4Sjeremylt     // Load basis data
927920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
928241a4b83SYohann     switch (emode) {
929241a4b83SYohann     case CEED_EVAL_NONE:
930241a4b83SYohann       break;
931241a4b83SYohann     case CEED_EVAL_INTERP:
932*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
933241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
934241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
935241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
936241a4b83SYohann       break;
937241a4b83SYohann     case CEED_EVAL_GRAD:
938*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
939241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
940241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
941241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
94264d3f0c0Sjeremylt       if (useCollograd) {
943ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
944ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
945ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
946ac421f39SYohann       } else {
947ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
948ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
950ac421f39SYohann       }
951241a4b83SYohann       break;
952241a4b83SYohann     case CEED_EVAL_WEIGHT:
953241a4b83SYohann       break; // No action
954241a4b83SYohann     case CEED_EVAL_DIV:
955241a4b83SYohann       break; // TODO: Not implemented
956241a4b83SYohann     case CEED_EVAL_CURL:
957241a4b83SYohann       break; // TODO: Not implemented
958241a4b83SYohann     }
959241a4b83SYohann   }
960920dcdc4Sjeremylt 
961818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
962241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
963818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
964241a4b83SYohann     // Get elemsize, emode, ncomp
965241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
966*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
967241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
968*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
969241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
970*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9714d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
972*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
973920dcdc4Sjeremylt 
974920dcdc4Sjeremylt     // Set field constants
975*e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
976920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
977*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
978241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
979920dcdc4Sjeremylt     } else {
980920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
981920dcdc4Sjeremylt     }
982920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
983920dcdc4Sjeremylt 
984920dcdc4Sjeremylt     // Load basis data
985920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
986920dcdc4Sjeremylt     switch (emode) {
987920dcdc4Sjeremylt     case CEED_EVAL_NONE:
988920dcdc4Sjeremylt       break; // No action
989920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
990*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
991241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
992241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
993241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
994241a4b83SYohann       break;
995241a4b83SYohann     case CEED_EVAL_GRAD:
996*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
997241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
998241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
999241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
100064d3f0c0Sjeremylt       if (useCollograd) {
1001ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
1002ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1003ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1004ac421f39SYohann       } else {
1005ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
1006ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1008ac421f39SYohann       }
1009241a4b83SYohann       break;
1010e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1011241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1012241a4b83SYohann       Ceed ceed;
1013*e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1014*e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1015241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1016241a4b83SYohann       break; // Should not occur
1017241a4b83SYohann     }
1018241a4b83SYohann     case CEED_EVAL_DIV:
1019241a4b83SYohann       break; // TODO: Not implemented
1020241a4b83SYohann     case CEED_EVAL_CURL:
1021241a4b83SYohann       break; // TODO: Not implemented
1022e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1023241a4b83SYohann     }
1024241a4b83SYohann   }
1025818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
1026ac421f39SYohann   code << "  __syncthreads();\n";
1027241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1028241a4b83SYohann   // Input basis apply if needed
1029ac421f39SYohann   // Generate the correct eval mode code for each input
1030818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
1031241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1032818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1033241a4b83SYohann     // Get elemsize, emode, ncomp
1034241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1035*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1036241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1037*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1038241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1039*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10404d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1041*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1042920dcdc4Sjeremylt 
1043920dcdc4Sjeremylt     // Restriction
1044920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
104564d3f0c0Sjeremylt         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1046241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10479a2291e3SJeremy L Thompson 
10489a2291e3SJeremy L Thompson       bool isStrided;
1049*e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10509a2291e3SJeremy L Thompson       if (!isStrided) {
10515c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1052*e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10535c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10545c7b696cSjeremylt         CeedInt compstride;
1055*e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10565c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
1057*e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10589a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10595c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1060ccedf6b0Sjeremylt       } else {
1061920dcdc4Sjeremylt         bool backendstrides;
1062fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1063*e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
106413585805SJeremy L Thompson         CeedInt nelem;
106513585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1066*e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
106713585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1068920dcdc4Sjeremylt         if (!backendstrides) {
1069920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1070*e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
1071ccedf6b0Sjeremylt         }
1072920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1073d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1074920dcdc4Sjeremylt       }
1075920dcdc4Sjeremylt     }
1076920dcdc4Sjeremylt 
1077920dcdc4Sjeremylt     // Basis action
1078920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1079920dcdc4Sjeremylt     switch (emode) {
1080920dcdc4Sjeremylt     case CEED_EVAL_NONE:
108164d3f0c0Sjeremylt       if (!useCollograd) {
1082920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1083920dcdc4Sjeremylt       }
1084920dcdc4Sjeremylt       break;
1085920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1086241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1087241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1088241a4b83SYohann       break;
1089241a4b83SYohann     case CEED_EVAL_GRAD:
109064d3f0c0Sjeremylt       if (useCollograd) {
1091ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1092ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1093ac421f39SYohann       } else {
1094241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1095241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1096ac421f39SYohann       }
1097241a4b83SYohann       break;
1098241a4b83SYohann     case CEED_EVAL_WEIGHT:
1099241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1100*e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1101*e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1102241a4b83SYohann       data->W = basis_data->d_qweight1d;
1103241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1104241a4b83SYohann       break; // No action
1105241a4b83SYohann     case CEED_EVAL_DIV:
1106241a4b83SYohann       break; // TODO: Not implemented
1107241a4b83SYohann     case CEED_EVAL_CURL:
1108241a4b83SYohann       break; // TODO: Not implemented
1109241a4b83SYohann     }
1110241a4b83SYohann   }
1111ac421f39SYohann 
1112241a4b83SYohann   // Q function
1113818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1114241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1115818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1116241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1117*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1118241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1119241a4b83SYohann     {
112064d3f0c0Sjeremylt       if (useCollograd) {
1121ac421f39SYohann         //Accumulator for gradient slices
1122ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1123ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1124ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1125ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1126ac421f39SYohann         code << "      }\n";
1127ac421f39SYohann         code << "    }\n";
1128ac421f39SYohann       } else {
1129241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1130241a4b83SYohann       }
1131ac421f39SYohann     }
1132241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1133241a4b83SYohann     {
1134241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1135241a4b83SYohann     }
1136241a4b83SYohann   }
1137ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
113864d3f0c0Sjeremylt   if (useCollograd) {
1139920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1140ac421f39SYohann     code << "#pragma unroll\n";
1141ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1142818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1143ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1144818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1145ac421f39SYohann       // Get elemsize, emode, ncomp
1146ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1147*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1148ac421f39SYohann       // Basis action
1149920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1150ac421f39SYohann       switch (emode) {
1151ac421f39SYohann       case CEED_EVAL_NONE:
1152ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11539a2291e3SJeremy L Thompson 
11549a2291e3SJeremy L Thompson         bool isStrided;
1155*e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1156*e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1157*e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11589a2291e3SJeremy L Thompson         if (!isStrided) {
11595c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1160*e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11615c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11625c7b696cSjeremylt           CeedInt compstride;
1163*e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11645c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
1165*e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11669a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11675c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1168920dcdc4Sjeremylt         } else {
1169920dcdc4Sjeremylt           bool backendstrides;
1170fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1171*e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
117213585805SJeremy L Thompson           CeedInt nelem;
117313585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1174*e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
117513585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1176920dcdc4Sjeremylt           if (!backendstrides) {
1177920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1178*e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
1179920dcdc4Sjeremylt           }
1180920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1181d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1182920dcdc4Sjeremylt         }
1183ac421f39SYohann         break;
1184ac421f39SYohann       case CEED_EVAL_INTERP:
1185ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1186ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1187ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1188ac421f39SYohann         code << "      }\n";
1189ac421f39SYohann         break;
1190ac421f39SYohann       case CEED_EVAL_GRAD:
1191ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1192ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1193ac421f39SYohann         break;
1194ac421f39SYohann       case CEED_EVAL_WEIGHT:
1195ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1196ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1197ac421f39SYohann         break; // No action
1198ac421f39SYohann       case CEED_EVAL_DIV:
1199ac421f39SYohann         break; // TODO: Not implemented
1200ac421f39SYohann       case CEED_EVAL_CURL:
1201ac421f39SYohann         break; // TODO: Not implemented
1202ac421f39SYohann       }
1203ac421f39SYohann     }
1204818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1205ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1206818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1207ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1208*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1209ac421f39SYohann       // Basis action
1210ac421f39SYohann       switch (emode) {
1211ac421f39SYohann       case CEED_EVAL_NONE:
1212ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1213ac421f39SYohann         break; // No action
1214ac421f39SYohann       case CEED_EVAL_INTERP:
1215ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1216ac421f39SYohann         break;
1217ac421f39SYohann       case CEED_EVAL_GRAD:
1218ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1219ac421f39SYohann         break;
1220ac421f39SYohann       case CEED_EVAL_WEIGHT:
1221ac421f39SYohann         break; // Should not occur
1222ac421f39SYohann       case CEED_EVAL_DIV:
1223ac421f39SYohann         break; // TODO: Not implemented
1224ac421f39SYohann       case CEED_EVAL_CURL:
1225ac421f39SYohann         break; // TODO: Not implemented
1226ac421f39SYohann       }
1227ac421f39SYohann     }
1228ac421f39SYohann   } else {
1229920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1230818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1231ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1232818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1233ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1234ac421f39SYohann     }
1235818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1236ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1237818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1238ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1239ac421f39SYohann     }
1240ac421f39SYohann   }
1241818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12424d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12434d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1244818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1245ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12464d537eeaSYohann   }
12474d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12484d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1249818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1250ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12514d537eeaSYohann   }
1252818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1253ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
125464d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
1255ac421f39SYohann     code << "1";
1256ac421f39SYohann   } else {
1257ac421f39SYohann     code << "Q1d";
1258ac421f39SYohann   }
1259ac421f39SYohann   code << ", in, out);\n";
126064d3f0c0Sjeremylt   if (useCollograd) {
1261920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1262818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1263ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1264818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1265ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1266*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1267ac421f39SYohann       // Basis action
1268920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1269ac421f39SYohann       switch (emode) {
1270ac421f39SYohann       case CEED_EVAL_NONE:
1271ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1272ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1273ac421f39SYohann         code << "      }\n";
1274ac421f39SYohann         break; // No action
1275ac421f39SYohann       case CEED_EVAL_INTERP:
1276ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1277ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1278ac421f39SYohann         code << "      }\n";
1279ac421f39SYohann         break;
1280ac421f39SYohann       case CEED_EVAL_GRAD:
1281ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1282ac421f39SYohann         break;
1283ac421f39SYohann       case CEED_EVAL_WEIGHT:
1284ac421f39SYohann         break; // Should not occur
1285ac421f39SYohann       case CEED_EVAL_DIV:
1286ac421f39SYohann         break; // TODO: Not implemented
1287ac421f39SYohann       case CEED_EVAL_CURL:
1288ac421f39SYohann         break; // TODO: Not implemented
1289ac421f39SYohann       }
1290ac421f39SYohann     }
1291ac421f39SYohann     code << "    }\n";
1292ac421f39SYohann   }
1293241a4b83SYohann 
1294241a4b83SYohann   // Output basis apply if needed
1295ac421f39SYohann   // Generate the correct eval mode code for each output
1296818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1297241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1298818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1299241a4b83SYohann     // Get elemsize, emode, ncomp
1300241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1301*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1302241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1303*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1304241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1305*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13064d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1307*e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1308241a4b83SYohann     // Basis action
1309920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1310241a4b83SYohann     switch (emode) {
1311241a4b83SYohann     case CEED_EVAL_NONE:
1312920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1313241a4b83SYohann       break; // No action
1314241a4b83SYohann     case CEED_EVAL_INTERP:
1315241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1316241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1317241a4b83SYohann       break;
1318241a4b83SYohann     case CEED_EVAL_GRAD:
1319241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
132064d3f0c0Sjeremylt       if (useCollograd) {
1321ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1322ac421f39SYohann       } else {
1323241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1324ac421f39SYohann       }
1325241a4b83SYohann       break;
1326e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1327241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1328241a4b83SYohann       Ceed ceed;
1329*e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1330*e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1331241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1332241a4b83SYohann       break; // Should not occur
1333241a4b83SYohann     }
1334241a4b83SYohann     case CEED_EVAL_DIV:
1335241a4b83SYohann       break; // TODO: Not implemented
1336241a4b83SYohann     case CEED_EVAL_CURL:
1337241a4b83SYohann       break; // TODO: Not implemented
1338e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1339241a4b83SYohann     }
1340920dcdc4Sjeremylt     // Restriction
13419a2291e3SJeremy L Thompson       bool isStrided;
1342*e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13439a2291e3SJeremy L Thompson     if (!isStrided) {
13445c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1345*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13465c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13475c7b696cSjeremylt       CeedInt compstride;
1348*e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13495c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
1350*e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13519a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13525c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1353920dcdc4Sjeremylt     } else {
1354920dcdc4Sjeremylt       bool backendstrides;
1355fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1356*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
135713585805SJeremy L Thompson       CeedInt nelem;
135813585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1359*e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
136013585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1361920dcdc4Sjeremylt       if (!backendstrides) {
1362920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1363*e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
1364920dcdc4Sjeremylt       }
1365920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1366d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1367920dcdc4Sjeremylt     }
1368241a4b83SYohann   }
1369241a4b83SYohann 
1370241a4b83SYohann   code << "  }\n";
1371818e0025SJeremy L Thompson   code << "}\n";
1372818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1373241a4b83SYohann 
1374ab213215SJeremy L Thompson   // View kernel for debugging
1375b8e71988SJeremy L Thompson   CeedDebug(code.str().c_str());
1376241a4b83SYohann 
137718d499f1SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
137818d499f1SYohann                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1379*e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1380c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1381*e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1382241a4b83SYohann 
1383*e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1384*e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1385241a4b83SYohann }
1386ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1387