xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 46dc07349c44057b9efae59cd6a2d41f419237bd)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
187df94212SJeremy L Thompson 
19ec3da8bcSJed Brown #include <ceed/ceed.h>
20ec3da8bcSJed Brown #include <ceed/backend.h>
213d576824SJeremy L Thompson #include <cuda_runtime.h>
22241a4b83SYohann #include <iostream>
23241a4b83SYohann #include <sstream>
243d576824SJeremy L Thompson #include "ceed-cuda-gen.h"
256d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
260d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h"
27241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
28241a4b83SYohann 
29f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
30ab213215SJeremy L Thompson //------------------------------------------------------------------------------
31ab213215SJeremy L Thompson // Atomic add, for older CUDA
32ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3380a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
34241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
35241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
36241a4b83SYohann   do {
37241a4b83SYohann     assumed = old;
38241a4b83SYohann     old =
39241a4b83SYohann       atomicCAS(address_as_ull, assumed,
40241a4b83SYohann                 __double_as_longlong(val +
41241a4b83SYohann                                     __longlong_as_double(assumed)));
42241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
43241a4b83SYohann     // (since NaN != NaN)
44241a4b83SYohann   } while (assumed != old);
45241a4b83SYohann   return __longlong_as_double(old);
46241a4b83SYohann }
47f1a13f77SYohann Dudouit );
48f1a13f77SYohann Dudouit 
49f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
50f1a13f77SYohann Dudouit 
51ab213215SJeremy L Thompson //------------------------------------------------------------------------------
52ab213215SJeremy L Thompson // Typedefs
53ab213215SJeremy L Thompson //------------------------------------------------------------------------------
54f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
55f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
56f1a13f77SYohann Dudouit 
57f1a13f77SYohann Dudouit typedef struct {
58f1a13f77SYohann Dudouit   CeedInt tidx;
59f1a13f77SYohann Dudouit   CeedInt tidy;
60f1a13f77SYohann Dudouit   CeedInt tidz;
61f1a13f77SYohann Dudouit   CeedInt tid;
62f1a13f77SYohann Dudouit   CeedScalar* slice;
63f1a13f77SYohann Dudouit } BackendData;
64241a4b83SYohann 
65ab213215SJeremy L Thompson //------------------------------------------------------------------------------
66ab213215SJeremy L Thompson // Load matrices for basis actions
67ab213215SJeremy L Thompson //------------------------------------------------------------------------------
68241a4b83SYohann template <int P, int Q>
69053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
70ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
71241a4b83SYohann     B[i] = d_B[i];
72241a4b83SYohann }
73241a4b83SYohann 
74ab213215SJeremy L Thompson //------------------------------------------------------------------------------
75241a4b83SYohann // 1D
76ab213215SJeremy L Thompson //------------------------------------------------------------------------------
77ab213215SJeremy L Thompson 
78ab213215SJeremy L Thompson //------------------------------------------------------------------------------
79ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
80ab213215SJeremy L Thompson //------------------------------------------------------------------------------
815c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
82053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
83ab213215SJeremy L Thompson   if (data.tidx < P1d) {
844d537eeaSYohann     const CeedInt node = data.tidx;
85ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
86ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
875c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
88241a4b83SYohann   }
89241a4b83SYohann }
90920dcdc4Sjeremylt 
91ab213215SJeremy L Thompson //------------------------------------------------------------------------------
92ab213215SJeremy L Thompson // L-vector -> E-vector, strided
93ab213215SJeremy L Thompson //------------------------------------------------------------------------------
94d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
95053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
96ab213215SJeremy L Thompson   if (data.tidx < P1d) {
97ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
98d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
99ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
100d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
101ccedf6b0Sjeremylt   }
102ccedf6b0Sjeremylt }
103241a4b83SYohann 
104ab213215SJeremy L Thompson //------------------------------------------------------------------------------
105ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
106ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1075c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
108053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
109ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1104d537eeaSYohann     const CeedInt node = data.tidx;
111ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
112ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1135c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
114241a4b83SYohann   }
115241a4b83SYohann }
116241a4b83SYohann 
117ab213215SJeremy L Thompson //------------------------------------------------------------------------------
118ab213215SJeremy L Thompson // E-vector -> L-vector, strided
119ab213215SJeremy L Thompson //------------------------------------------------------------------------------
120d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
121d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
122ab213215SJeremy L Thompson   if (data.tidx < P1d) {
123ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
124d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
125ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
126d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
127ccedf6b0Sjeremylt   }
128ccedf6b0Sjeremylt }
129ccedf6b0Sjeremylt 
130ab213215SJeremy L Thompson //------------------------------------------------------------------------------
131ab213215SJeremy L Thompson // 1D tensor contraction x
132ab213215SJeremy L Thompson //------------------------------------------------------------------------------
133241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
134ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
135241a4b83SYohann   data.slice[data.tidx] = *U;
136241a4b83SYohann   __syncthreads();
137241a4b83SYohann   *V = 0.0;
13818d499f1SYohann   if (data.tidx < Q1d)
139ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
140ab213215SJeremy L Thompson       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
141241a4b83SYohann   __syncthreads();
142241a4b83SYohann }
143241a4b83SYohann 
144ab213215SJeremy L Thompson //------------------------------------------------------------------------------
145ab213215SJeremy L Thompson // 1D transpose tensor contraction x
146ab213215SJeremy L Thompson //------------------------------------------------------------------------------
147241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
148ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
149241a4b83SYohann   data.slice[data.tidx] = *U;
150241a4b83SYohann   __syncthreads();
151241a4b83SYohann   *V = 0.0;
15218d499f1SYohann   if (data.tidx < P1d)
153ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
154ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
155241a4b83SYohann   __syncthreads();
156241a4b83SYohann }
157241a4b83SYohann 
158ab213215SJeremy L Thompson //------------------------------------------------------------------------------
159ab213215SJeremy L Thompson // 1D interpolate to quadrature points
160ab213215SJeremy L Thompson //------------------------------------------------------------------------------
161241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
162ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
164241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165241a4b83SYohann }
166241a4b83SYohann 
167ab213215SJeremy L Thompson //------------------------------------------------------------------------------
168ab213215SJeremy L Thompson // 1D interpolate transpose
169ab213215SJeremy L Thompson //------------------------------------------------------------------------------
170241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
171ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
172ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
173241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
174241a4b83SYohann }
175241a4b83SYohann 
176ab213215SJeremy L Thompson //------------------------------------------------------------------------------
177ab213215SJeremy L Thompson // 1D derivatives at quadrature points
178ab213215SJeremy L Thompson //------------------------------------------------------------------------------
179241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
180ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
182241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183241a4b83SYohann }
184241a4b83SYohann 
185ab213215SJeremy L Thompson //------------------------------------------------------------------------------
186ab213215SJeremy L Thompson // 1D derivatives transpose
187ab213215SJeremy L Thompson //------------------------------------------------------------------------------
188241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
189ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
190ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
191241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
192241a4b83SYohann }
193241a4b83SYohann 
194ab213215SJeremy L Thompson //------------------------------------------------------------------------------
195241a4b83SYohann // 2D
196ab213215SJeremy L Thompson //------------------------------------------------------------------------------
197ab213215SJeremy L Thompson 
198ab213215SJeremy L Thompson //------------------------------------------------------------------------------
199ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
200ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
202053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
203ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2044d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
205ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
206ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2075c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
208241a4b83SYohann   }
209241a4b83SYohann }
210920dcdc4Sjeremylt 
211ab213215SJeremy L Thompson //------------------------------------------------------------------------------
212ab213215SJeremy L Thompson // L-vector -> E-vector, strided
213ab213215SJeremy L Thompson //------------------------------------------------------------------------------
214d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
215053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
216ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
217ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
218d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
219ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
220d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
221ccedf6b0Sjeremylt   }
222ccedf6b0Sjeremylt }
223241a4b83SYohann 
224ab213215SJeremy L Thompson //------------------------------------------------------------------------------
225ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
226ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2275c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
228053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
229ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2304d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
231ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
232ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2335c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
234241a4b83SYohann   }
235241a4b83SYohann }
236241a4b83SYohann 
237ab213215SJeremy L Thompson //------------------------------------------------------------------------------
238ab213215SJeremy L Thompson // E-vector -> L-vector, strided
239ab213215SJeremy L Thompson //------------------------------------------------------------------------------
240d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
241d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
242ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
243ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
244d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
245ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
246d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
247ccedf6b0Sjeremylt   }
248ccedf6b0Sjeremylt }
249ccedf6b0Sjeremylt 
250ab213215SJeremy L Thompson //------------------------------------------------------------------------------
251ab213215SJeremy L Thompson // 2D tensor contraction x
252ab213215SJeremy L Thompson //------------------------------------------------------------------------------
253241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
254ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
25518d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
256241a4b83SYohann   __syncthreads();
257241a4b83SYohann   *V = 0.0;
25818d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
259ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
26018d499f1SYohann       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
261241a4b83SYohann   __syncthreads();
262241a4b83SYohann }
263241a4b83SYohann 
264ab213215SJeremy L Thompson //------------------------------------------------------------------------------
265ab213215SJeremy L Thompson // 2D tensor contract y
266ab213215SJeremy L Thompson //------------------------------------------------------------------------------
267241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
268ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
26918d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
270241a4b83SYohann   __syncthreads();
271241a4b83SYohann   *V = 0.0;
27218d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d)
273ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
27418d499f1SYohann       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
275241a4b83SYohann   __syncthreads();
276241a4b83SYohann }
277241a4b83SYohann 
278ab213215SJeremy L Thompson //------------------------------------------------------------------------------
279ab213215SJeremy L Thompson // 2D transpose tensor contract y
280ab213215SJeremy L Thompson //------------------------------------------------------------------------------
281241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
282ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
28318d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
284241a4b83SYohann   __syncthreads();
285241a4b83SYohann   *V = 0.0;
28618d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
287ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
28818d499f1SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
289241a4b83SYohann   __syncthreads();
290241a4b83SYohann }
291241a4b83SYohann 
292ab213215SJeremy L Thompson //------------------------------------------------------------------------------
293ab213215SJeremy L Thompson // 2D transpose tensor contract x
294ab213215SJeremy L Thompson //------------------------------------------------------------------------------
295241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
296ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
29718d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
298241a4b83SYohann   __syncthreads();
299241a4b83SYohann   *V = 0.0;
30018d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
301ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
30218d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
303241a4b83SYohann   __syncthreads();
304241a4b83SYohann }
305241a4b83SYohann 
306ab213215SJeremy L Thompson //------------------------------------------------------------------------------
307ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
308ab213215SJeremy L Thompson //------------------------------------------------------------------------------
309241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
310ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
31118d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
312241a4b83SYohann   __syncthreads();
31318d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
314ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
31518d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
316241a4b83SYohann   __syncthreads();
317241a4b83SYohann }
318241a4b83SYohann 
319ab213215SJeremy L Thompson //------------------------------------------------------------------------------
320ab213215SJeremy L Thompson // 2D interpolate to quadrature points
321ab213215SJeremy L Thompson //------------------------------------------------------------------------------
322241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
323ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
324241a4b83SYohann   CeedScalar r_t[1];
325ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
326241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
327241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
328241a4b83SYohann   }
329241a4b83SYohann }
330241a4b83SYohann 
331ab213215SJeremy L Thompson //------------------------------------------------------------------------------
332ab213215SJeremy L Thompson // 2D interpolate transpose
333ab213215SJeremy L Thompson //------------------------------------------------------------------------------
334241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
335ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
336241a4b83SYohann   CeedScalar r_t[1];
337ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
338241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
339241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
340241a4b83SYohann   }
341241a4b83SYohann }
342241a4b83SYohann 
343ab213215SJeremy L Thompson //------------------------------------------------------------------------------
344ab213215SJeremy L Thompson // 2D derivatives at quadrature points
345ab213215SJeremy L Thompson //------------------------------------------------------------------------------
346241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
347ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
348241a4b83SYohann   CeedScalar r_t[1];
349ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
350241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
351241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
352241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
353241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
354241a4b83SYohann   }
355241a4b83SYohann }
356241a4b83SYohann 
357ab213215SJeremy L Thompson //------------------------------------------------------------------------------
358ab213215SJeremy L Thompson // 2D derivatives transpose
359ab213215SJeremy L Thompson //------------------------------------------------------------------------------
360241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
361ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
362241a4b83SYohann   CeedScalar r_t[1];
363ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
364241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
365241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
366241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
367241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
368241a4b83SYohann   }
369241a4b83SYohann }
370241a4b83SYohann 
371ab213215SJeremy L Thompson //------------------------------------------------------------------------------
372241a4b83SYohann // 3D
373ab213215SJeremy L Thompson //------------------------------------------------------------------------------
374ab213215SJeremy L Thompson 
375ab213215SJeremy L Thompson //------------------------------------------------------------------------------
376ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
377ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3785c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
379053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
380ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
381241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3824d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
383ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
384ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3855c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
386241a4b83SYohann     }
387241a4b83SYohann }
388920dcdc4Sjeremylt 
389ab213215SJeremy L Thompson //------------------------------------------------------------------------------
390ab213215SJeremy L Thompson // L-vector -> E-vector, strided
391ab213215SJeremy L Thompson //------------------------------------------------------------------------------
392d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
393053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
394ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
395ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
396ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
397d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
398ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
399d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
400ccedf6b0Sjeremylt     }
401ccedf6b0Sjeremylt }
402241a4b83SYohann 
403ab213215SJeremy L Thompson //------------------------------------------------------------------------------
404ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
405ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4065c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
407053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
40818d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
409ac421f39SYohann     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410920dcdc4Sjeremylt     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
411ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4125c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
413ac421f39SYohann   }
41418d499f1SYohann }
415ac421f39SYohann 
416ab213215SJeremy L Thompson //------------------------------------------------------------------------------
417ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
418ab213215SJeremy L Thompson //------------------------------------------------------------------------------
419d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
420053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
42118d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
422920dcdc4Sjeremylt     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
423d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
424ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
425d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
426920dcdc4Sjeremylt   }
42718d499f1SYohann }
428920dcdc4Sjeremylt 
429ab213215SJeremy L Thompson //------------------------------------------------------------------------------
430ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
431ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4325c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
433053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
434ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
435241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4364d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
437ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
438ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4395c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
440241a4b83SYohann     }
441241a4b83SYohann }
442241a4b83SYohann 
443ab213215SJeremy L Thompson //------------------------------------------------------------------------------
444ab213215SJeremy L Thompson // E-vector -> L-vector, strided
445ab213215SJeremy L Thompson //------------------------------------------------------------------------------
446d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4472f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
448ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
449ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
450ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
451d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
452ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
453d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
454ccedf6b0Sjeremylt     }
455ccedf6b0Sjeremylt }
456ccedf6b0Sjeremylt 
457ab213215SJeremy L Thompson //------------------------------------------------------------------------------
458ab213215SJeremy L Thompson // 3D tensor contract x
459ab213215SJeremy L Thompson //------------------------------------------------------------------------------
460241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
461ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
462ac421f39SYohann   CeedScalar r_B[P1d];
463ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
464ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
465ab213215SJeremy L Thompson 
466ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
46718d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
468241a4b83SYohann     __syncthreads();
469241a4b83SYohann     V[k] = 0.0;
47018d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
471ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
47218d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
473241a4b83SYohann     __syncthreads();
474241a4b83SYohann   }
475241a4b83SYohann }
476241a4b83SYohann 
477ab213215SJeremy L Thompson //------------------------------------------------------------------------------
478ab213215SJeremy L Thompson // 3D tensor contract y
479ab213215SJeremy L Thompson //------------------------------------------------------------------------------
480241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
481ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
482ac421f39SYohann   CeedScalar r_B[P1d];
483ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
484ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
485ab213215SJeremy L Thompson 
486ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
48718d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
488241a4b83SYohann     __syncthreads();
489241a4b83SYohann     V[k] = 0.0;
49018d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
491ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
49218d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
493241a4b83SYohann     __syncthreads();
494241a4b83SYohann   }
495241a4b83SYohann }
496241a4b83SYohann 
497ab213215SJeremy L Thompson //------------------------------------------------------------------------------
498ab213215SJeremy L Thompson // 3D tensor contract z
499ab213215SJeremy L Thompson //------------------------------------------------------------------------------
500241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
501ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
502ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
503241a4b83SYohann     V[k] = 0.0;
50418d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
505ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
506ab213215SJeremy L Thompson         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
507241a4b83SYohann   }
508241a4b83SYohann }
509241a4b83SYohann 
510ab213215SJeremy L Thompson //------------------------------------------------------------------------------
511ab213215SJeremy L Thompson // 3D transpose tensor contract z
512ab213215SJeremy L Thompson //------------------------------------------------------------------------------
513241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
514ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
51518d499f1SYohann   for (CeedInt k = 0; k < P1d; ++k) {
516241a4b83SYohann     V[k] = 0.0;
51718d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
518ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
519ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
520241a4b83SYohann   }
521241a4b83SYohann }
522241a4b83SYohann 
523ab213215SJeremy L Thompson //------------------------------------------------------------------------------
524ab213215SJeremy L Thompson // 3D transpose tensor contract y
525ab213215SJeremy L Thompson //------------------------------------------------------------------------------
526241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
527ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
528ac421f39SYohann   CeedScalar r_B[Q1d];
529ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
530ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
531ab213215SJeremy L Thompson 
532ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
53318d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
534241a4b83SYohann     __syncthreads();
535241a4b83SYohann     V[k] = 0.0;
53618d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
537ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
53818d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
539ac421f39SYohann     __syncthreads();
540ac421f39SYohann   }
541ac421f39SYohann }
542ac421f39SYohann 
543ab213215SJeremy L Thompson //------------------------------------------------------------------------------
544ab213215SJeremy L Thompson // 3D transpose tensor contract add y
545ab213215SJeremy L Thompson //------------------------------------------------------------------------------
546ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
547ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
548ac421f39SYohann   CeedScalar r_B[Q1d];
549ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
550ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
551ab213215SJeremy L Thompson 
552ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
55318d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
554ac421f39SYohann     __syncthreads();
55518d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
556ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
55718d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
558241a4b83SYohann     __syncthreads();
559241a4b83SYohann   }
560241a4b83SYohann }
561241a4b83SYohann 
562ab213215SJeremy L Thompson //------------------------------------------------------------------------------
563ab213215SJeremy L Thompson // 3D transpose tensor contract x
564ab213215SJeremy L Thompson //------------------------------------------------------------------------------
565241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
566ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
567ac421f39SYohann   CeedScalar r_B[Q1d];
568ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
569ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
570ab213215SJeremy L Thompson 
571ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
57218d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
573241a4b83SYohann     __syncthreads();
574241a4b83SYohann     V[k] = 0.0;
57518d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
576ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
57718d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
578241a4b83SYohann     __syncthreads();
579241a4b83SYohann   }
580241a4b83SYohann }
581241a4b83SYohann 
582ab213215SJeremy L Thompson //------------------------------------------------------------------------------
583ab213215SJeremy L Thompson // 3D transpose tensor contract add x
584ab213215SJeremy L Thompson //------------------------------------------------------------------------------
585241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
586ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
587ac421f39SYohann   CeedScalar r_B[Q1d];
588ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
589ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
590ab213215SJeremy L Thompson 
591ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
59218d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
593241a4b83SYohann     __syncthreads();
59418d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
595ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
59618d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
597241a4b83SYohann     __syncthreads();
598241a4b83SYohann   }
599241a4b83SYohann }
600241a4b83SYohann 
601ab213215SJeremy L Thompson //------------------------------------------------------------------------------
602ab213215SJeremy L Thompson // 3D interpolate to quadrature points
603ab213215SJeremy L Thompson //------------------------------------------------------------------------------
604241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
605ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
60618d499f1SYohann   CeedScalar r_t1[T1d];
60718d499f1SYohann   CeedScalar r_t2[T1d];
608ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
609241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
610241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
611241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
612241a4b83SYohann   }
613241a4b83SYohann }
614241a4b83SYohann 
615ab213215SJeremy L Thompson //------------------------------------------------------------------------------
616ab213215SJeremy L Thompson // 3D interpolate transpose
617ab213215SJeremy L Thompson //------------------------------------------------------------------------------
618241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
619ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
62018d499f1SYohann   CeedScalar r_t1[T1d];
62118d499f1SYohann   CeedScalar r_t2[T1d];
622ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
623241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
624241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
625241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
626241a4b83SYohann   }
627241a4b83SYohann }
628241a4b83SYohann 
629ab213215SJeremy L Thompson //------------------------------------------------------------------------------
630ab213215SJeremy L Thompson // 3D derivatives at quadrature points
631ab213215SJeremy L Thompson //------------------------------------------------------------------------------
632241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
633ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
63418d499f1SYohann   CeedScalar r_t1[T1d];
63518d499f1SYohann   CeedScalar r_t2[T1d];
636ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
637241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
638241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
639241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
640241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
641241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
642241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
643241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
644241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
645241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
646241a4b83SYohann   }
647241a4b83SYohann }
648241a4b83SYohann 
649ab213215SJeremy L Thompson //------------------------------------------------------------------------------
650ab213215SJeremy L Thompson // 3D derivatives transpose
651ab213215SJeremy L Thompson //------------------------------------------------------------------------------
652241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
653ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
65418d499f1SYohann   CeedScalar r_t1[T1d];
65518d499f1SYohann   CeedScalar r_t2[T1d];
656ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
657241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
658241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
659241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
660241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
661241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
662241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
663241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
664241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
665241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
666241a4b83SYohann   }
667241a4b83SYohann }
668241a4b83SYohann 
669ab213215SJeremy L Thompson //------------------------------------------------------------------------------
670ab213215SJeremy L Thompson // 3D collocated derivatives computation
671ab213215SJeremy L Thompson //------------------------------------------------------------------------------
672ac421f39SYohann template <int NCOMP, int Q1d>
673ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
67418d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
675ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
67618d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
677ac421f39SYohann       __syncthreads();
678ac421f39SYohann       // X derivative
679ac421f39SYohann       r_V[comp+0*NCOMP] = 0.0;
680ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
68118d499f1SYohann         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
682ac421f39SYohann       // Y derivative
683ac421f39SYohann       r_V[comp+1*NCOMP] = 0.0;
684ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
68518d499f1SYohann         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
686ac421f39SYohann       // Z derivative
687ac421f39SYohann       r_V[comp+2*NCOMP] = 0.0;
688ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
689ab213215SJeremy L Thompson         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
690ac421f39SYohann       __syncthreads();
691ac421f39SYohann     }
692ac421f39SYohann   }
69318d499f1SYohann }
694ac421f39SYohann 
695ab213215SJeremy L Thompson //------------------------------------------------------------------------------
696ab213215SJeremy L Thompson // 3D collocated derivatives transpose
697ab213215SJeremy L Thompson //------------------------------------------------------------------------------
698ac421f39SYohann template <int NCOMP, int Q1d>
699ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
70018d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
701ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
702ac421f39SYohann       // X derivative
70318d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
704ac421f39SYohann       __syncthreads();
705ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
70618d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
707ac421f39SYohann       __syncthreads();
708ac421f39SYohann       // Y derivative
70918d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
710ac421f39SYohann       __syncthreads();
711ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
71218d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
713ac421f39SYohann       __syncthreads();
714ac421f39SYohann       // Z derivative
715ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
716ac421f39SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
717ac421f39SYohann     }
718ac421f39SYohann   }
71918d499f1SYohann }
720ac421f39SYohann 
721ab213215SJeremy L Thompson //------------------------------------------------------------------------------
722ab213215SJeremy L Thompson // 1D quadrature weights
723ab213215SJeremy L Thompson //------------------------------------------------------------------------------
724241a4b83SYohann template <int Q1d>
725053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
72618d499f1SYohann   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
727241a4b83SYohann }
728241a4b83SYohann 
729ab213215SJeremy L Thompson //------------------------------------------------------------------------------
730ab213215SJeremy L Thompson // 2D quadrature weights
731ab213215SJeremy L Thompson //------------------------------------------------------------------------------
732241a4b83SYohann template <int Q1d>
733053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
73418d499f1SYohann   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
73518d499f1SYohann         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736241a4b83SYohann }
737241a4b83SYohann 
738ab213215SJeremy L Thompson //------------------------------------------------------------------------------
739ab213215SJeremy L Thompson // 3D quadrature weights
740ab213215SJeremy L Thompson //------------------------------------------------------------------------------
741241a4b83SYohann template <int Q1d>
742053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
74318d499f1SYohann   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
74418d499f1SYohann   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
745ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
74618d499f1SYohann     w[z] = quad ? pw*qweight1d[z] : 0.0;
747241a4b83SYohann }
748241a4b83SYohann 
749241a4b83SYohann );
750ab213215SJeremy L Thompson //------------------------------------------------------------------------------
751ab213215SJeremy L Thompson // Build singe operator kernel
752ab213215SJeremy L Thompson //------------------------------------------------------------------------------
753241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
754241a4b83SYohann 
755241a4b83SYohann   using std::ostringstream;
756241a4b83SYohann   using std::string;
757241a4b83SYohann   int ierr;
758241a4b83SYohann   bool setupdone;
759e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
761241a4b83SYohann   Ceed ceed;
762e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763241a4b83SYohann   CeedOperator_Cuda_gen *data;
764e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765241a4b83SYohann   CeedQFunction qf;
766241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
767e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
76988db6d3bSJeremy L Thompson   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
7705c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
771e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
772e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
773241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
7747e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
775e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
776241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
7777e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
778e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
779241a4b83SYohann   CeedEvalMode emode;
780241a4b83SYohann   CeedBasis basis;
781241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
782241a4b83SYohann   CeedElemRestriction Erestrict;
783461525f5SNatalie Beams   CeedElemRestriction_Cuda *restr_data;
784241a4b83SYohann 
7850b454692Sjeremylt   // Check for restriction only identity operator
7860b454692Sjeremylt   bool is_identity_qf;
7870b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7880b454692Sjeremylt   if (is_identity_qf) {
7890b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7900b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
7910b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
7920b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
7930b454692Sjeremylt       // LCOV_EXCL_START
7940b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
7950b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
7960b454692Sjeremylt     // LCOV_EXCL_STOP
7970b454692Sjeremylt   }
7980b454692Sjeremylt 
799241a4b83SYohann   ostringstream code;
800241a4b83SYohann   string devFunctions(deviceFunctions);
801241a4b83SYohann 
802f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
803f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
804f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
80588db6d3bSJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
8060d0321e0SJeremy L Thompson   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
80780a9ef05SNatalie Beams   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
808f1a13f77SYohann Dudouit     code << atomicAdd;
809f1a13f77SYohann Dudouit   }
810f1a13f77SYohann Dudouit 
811241a4b83SYohann   code << devFunctions;
812241a4b83SYohann 
813241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
814c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
815c3c443faSYohann Dudouit   string oper;
81614cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
8174d537eeaSYohann 
8184d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81977841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
820e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
821e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8221da99368SJeremy L Thompson 
8231da99368SJeremy L Thompson   // Find dim and Q1d
82464d3f0c0Sjeremylt   bool useCollograd = true;
82518d499f1SYohann   data->maxP1d = 0;
8261da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
827e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8281da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
829e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8300f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
831e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8321da99368SJeremy L Thompson 
8331da99368SJeremy L Thompson       // Check for collocated gradient
834437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8351da99368SJeremy L Thompson 
8361da99368SJeremy L Thompson       // Collect dim and Q1d
837e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8381da99368SJeremy L Thompson       bool isTensor;
839e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8401da99368SJeremy L Thompson       if (isTensor) {
841e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
842e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
84318d499f1SYohann         if (P1d>data->maxP1d) data->maxP1d = P1d;
8441da99368SJeremy L Thompson       } else {
845e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
846e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
847e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8481da99368SJeremy L Thompson         }
8491da99368SJeremy L Thompson     }
8501da99368SJeremy L Thompson   }
8511da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8521da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8531da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
854e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8550f54b25eSjeremylt 
8561da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
857e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8580f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
859e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8600f54b25eSjeremylt 
8611da99368SJeremy L Thompson       // Collect dim and Q1d
862e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8631da99368SJeremy L Thompson       bool isTensor;
864e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8651da99368SJeremy L Thompson       if (isTensor) {
866e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8671da99368SJeremy L Thompson       } else {
868e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
869e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
870e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8711da99368SJeremy L Thompson         }
8720f54b25eSjeremylt 
8730f54b25eSjeremylt       // Check for collocated gradient
874437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8751da99368SJeremy L Thompson     }
8761da99368SJeremy L Thompson   }
8771da99368SJeremy L Thompson   data->dim = dim;
8781da99368SJeremy L Thompson   data->Q1d = Q1d;
8791da99368SJeremy L Thompson 
8801da99368SJeremy L Thompson   // Define CEED_Q_VLA
88164d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
8821da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8831da99368SJeremy L Thompson   } else {
8841da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8851da99368SJeremy L Thompson   }
8861da99368SJeremy L Thompson 
887241a4b83SYohann   code << qFunction;
888241a4b83SYohann 
889241a4b83SYohann   // Setup
890818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
891c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
892241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
893241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
894e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8951da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
896241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
897241a4b83SYohann     }
898241a4b83SYohann   }
899241a4b83SYohann 
900241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
901241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
902241a4b83SYohann   }
9031da99368SJeremy L Thompson 
904241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
905241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
9061da99368SJeremy L Thompson 
907241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
908241a4b83SYohann   code << "  BackendData data;\n";
909241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
910241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
911241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
912241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
91318d499f1SYohann   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
914920dcdc4Sjeremylt 
915818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
916ac421f39SYohann   //Initialize constants, and matrices B and G
917241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
918818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
919241a4b83SYohann     // Get elemsize, emode, ncomp
920241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
921e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
922241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
923e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
924241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
925e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9264d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
927e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
928920dcdc4Sjeremylt 
929920dcdc4Sjeremylt     // Set field constants
930920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
931e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
932920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
933e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
934920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
935920dcdc4Sjeremylt       } else {
936920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
937920dcdc4Sjeremylt       }
938920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
939920dcdc4Sjeremylt     }
940920dcdc4Sjeremylt 
941920dcdc4Sjeremylt     // Load basis data
942920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
943241a4b83SYohann     switch (emode) {
944241a4b83SYohann     case CEED_EVAL_NONE:
945241a4b83SYohann       break;
946241a4b83SYohann     case CEED_EVAL_INTERP:
947e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
948437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94980a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
950241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
951241a4b83SYohann       break;
952241a4b83SYohann     case CEED_EVAL_GRAD:
953e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
954437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
95580a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
956241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
95764d3f0c0Sjeremylt       if (useCollograd) {
958437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
95980a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
960ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
961ac421f39SYohann       } else {
962437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
96380a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
964241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
965ac421f39SYohann       }
966241a4b83SYohann       break;
967241a4b83SYohann     case CEED_EVAL_WEIGHT:
968241a4b83SYohann       break; // No action
969241a4b83SYohann     case CEED_EVAL_DIV:
970241a4b83SYohann       break; // TODO: Not implemented
971241a4b83SYohann     case CEED_EVAL_CURL:
972241a4b83SYohann       break; // TODO: Not implemented
973241a4b83SYohann     }
974241a4b83SYohann   }
975920dcdc4Sjeremylt 
976818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
977241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
978818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
979241a4b83SYohann     // Get elemsize, emode, ncomp
980241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
981e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
982241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
983e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
984241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
985e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9864d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
987e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
988920dcdc4Sjeremylt 
989920dcdc4Sjeremylt     // Set field constants
990e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
991920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
992e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
993241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
994920dcdc4Sjeremylt     } else {
995920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
996920dcdc4Sjeremylt     }
997920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
998920dcdc4Sjeremylt 
999920dcdc4Sjeremylt     // Load basis data
1000920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1001920dcdc4Sjeremylt     switch (emode) {
1002920dcdc4Sjeremylt     case CEED_EVAL_NONE:
1003920dcdc4Sjeremylt       break; // No action
1004920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1005e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1006437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100780a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1008241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1009241a4b83SYohann       break;
1010241a4b83SYohann     case CEED_EVAL_GRAD:
1011e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1012437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
101380a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1014241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
101564d3f0c0Sjeremylt       if (useCollograd) {
1016437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
101780a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1018ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1019ac421f39SYohann       } else {
1020437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
102180a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1022241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1023ac421f39SYohann       }
1024241a4b83SYohann       break;
1025e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1026241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1027241a4b83SYohann       Ceed ceed;
1028e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1029e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1030241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1031241a4b83SYohann       break; // Should not occur
1032241a4b83SYohann     }
1033241a4b83SYohann     case CEED_EVAL_DIV:
1034241a4b83SYohann       break; // TODO: Not implemented
1035241a4b83SYohann     case CEED_EVAL_CURL:
1036241a4b83SYohann       break; // TODO: Not implemented
1037e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1038241a4b83SYohann     }
1039241a4b83SYohann   }
1040818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
1041ac421f39SYohann   code << "  __syncthreads();\n";
1042241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1043241a4b83SYohann   // Input basis apply if needed
1044ac421f39SYohann   // Generate the correct eval mode code for each input
1045818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
1046241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1047818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1048241a4b83SYohann     // Get elemsize, emode, ncomp
1049241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1050e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1051241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1052e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1053241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1054e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10554d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1056e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1057920dcdc4Sjeremylt 
1058920dcdc4Sjeremylt     // Restriction
1059920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
106064d3f0c0Sjeremylt         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1061241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10629a2291e3SJeremy L Thompson 
10639a2291e3SJeremy L Thompson       bool isStrided;
1064e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10659a2291e3SJeremy L Thompson       if (!isStrided) {
10665c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1067e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10685c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10695c7b696cSjeremylt         CeedInt compstride;
1070e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10715c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
1072e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10739a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10745c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1075ccedf6b0Sjeremylt       } else {
1076920dcdc4Sjeremylt         bool backendstrides;
1077fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1078e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
107913585805SJeremy L Thompson         CeedInt nelem;
108013585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1081e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
108213585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1083920dcdc4Sjeremylt         if (!backendstrides) {
1084920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1085e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
1086ccedf6b0Sjeremylt         }
1087920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1088d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1089920dcdc4Sjeremylt       }
1090920dcdc4Sjeremylt     }
1091920dcdc4Sjeremylt 
1092920dcdc4Sjeremylt     // Basis action
1093920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1094920dcdc4Sjeremylt     switch (emode) {
1095920dcdc4Sjeremylt     case CEED_EVAL_NONE:
109664d3f0c0Sjeremylt       if (!useCollograd) {
1097920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1098920dcdc4Sjeremylt       }
1099920dcdc4Sjeremylt       break;
1100920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1101241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1102241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1103241a4b83SYohann       break;
1104241a4b83SYohann     case CEED_EVAL_GRAD:
110564d3f0c0Sjeremylt       if (useCollograd) {
1106ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1107ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1108ac421f39SYohann       } else {
1109241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1110241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1111ac421f39SYohann       }
1112241a4b83SYohann       break;
1113241a4b83SYohann     case CEED_EVAL_WEIGHT:
1114241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1115e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1116e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1117437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
1118241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1119241a4b83SYohann       break; // No action
1120241a4b83SYohann     case CEED_EVAL_DIV:
1121241a4b83SYohann       break; // TODO: Not implemented
1122241a4b83SYohann     case CEED_EVAL_CURL:
1123241a4b83SYohann       break; // TODO: Not implemented
1124241a4b83SYohann     }
1125241a4b83SYohann   }
1126ac421f39SYohann 
1127241a4b83SYohann   // Q function
1128818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1129241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1130818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1131241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1132e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1133241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1134241a4b83SYohann     {
113564d3f0c0Sjeremylt       if (useCollograd) {
1136ac421f39SYohann         //Accumulator for gradient slices
1137ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1138ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1139ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1140ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1141ac421f39SYohann         code << "      }\n";
1142ac421f39SYohann         code << "    }\n";
1143ac421f39SYohann       } else {
1144241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1145241a4b83SYohann       }
1146ac421f39SYohann     }
1147241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1148241a4b83SYohann     {
1149241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1150241a4b83SYohann     }
1151241a4b83SYohann   }
1152ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
115364d3f0c0Sjeremylt   if (useCollograd) {
1154920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1155ac421f39SYohann     code << "#pragma unroll\n";
1156ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1157818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1158ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1159818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1160ac421f39SYohann       // Get elemsize, emode, ncomp
1161ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1162e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1163ac421f39SYohann       // Basis action
1164920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1165ac421f39SYohann       switch (emode) {
1166ac421f39SYohann       case CEED_EVAL_NONE:
1167ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11689a2291e3SJeremy L Thompson 
11699a2291e3SJeremy L Thompson         bool isStrided;
1170e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1171e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1172e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11739a2291e3SJeremy L Thompson         if (!isStrided) {
11745c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1175e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11765c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11775c7b696cSjeremylt           CeedInt compstride;
1178e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11795c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
1180e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11819a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11825c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1183920dcdc4Sjeremylt         } else {
1184920dcdc4Sjeremylt           bool backendstrides;
1185fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1186e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
118713585805SJeremy L Thompson           CeedInt nelem;
118813585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1189e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
119013585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1191920dcdc4Sjeremylt           if (!backendstrides) {
1192920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1193e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
1194920dcdc4Sjeremylt           }
1195920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1196d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1197920dcdc4Sjeremylt         }
1198ac421f39SYohann         break;
1199ac421f39SYohann       case CEED_EVAL_INTERP:
1200ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1201ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1202ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1203ac421f39SYohann         code << "      }\n";
1204ac421f39SYohann         break;
1205ac421f39SYohann       case CEED_EVAL_GRAD:
1206ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1207ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1208ac421f39SYohann         break;
1209ac421f39SYohann       case CEED_EVAL_WEIGHT:
1210ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1211ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1212ac421f39SYohann         break; // No action
1213ac421f39SYohann       case CEED_EVAL_DIV:
1214ac421f39SYohann         break; // TODO: Not implemented
1215ac421f39SYohann       case CEED_EVAL_CURL:
1216ac421f39SYohann         break; // TODO: Not implemented
1217ac421f39SYohann       }
1218ac421f39SYohann     }
1219818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1220ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1221818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1222ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1223e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1224ac421f39SYohann       // Basis action
1225ac421f39SYohann       switch (emode) {
1226ac421f39SYohann       case CEED_EVAL_NONE:
1227ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1228ac421f39SYohann         break; // No action
1229ac421f39SYohann       case CEED_EVAL_INTERP:
1230ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1231ac421f39SYohann         break;
1232ac421f39SYohann       case CEED_EVAL_GRAD:
1233ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1234ac421f39SYohann         break;
1235ac421f39SYohann       case CEED_EVAL_WEIGHT:
1236ac421f39SYohann         break; // Should not occur
1237ac421f39SYohann       case CEED_EVAL_DIV:
1238ac421f39SYohann         break; // TODO: Not implemented
1239ac421f39SYohann       case CEED_EVAL_CURL:
1240ac421f39SYohann         break; // TODO: Not implemented
1241ac421f39SYohann       }
1242ac421f39SYohann     }
1243ac421f39SYohann   } else {
1244920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1245818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1246ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1247818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1248ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1249ac421f39SYohann     }
1250818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1251ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1252818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1253ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1254ac421f39SYohann     }
1255ac421f39SYohann   }
1256818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12574d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12584d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1259818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1260ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12614d537eeaSYohann   }
12624d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12634d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1264818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1265ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12664d537eeaSYohann   }
1267818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1268ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
126964d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
1270ac421f39SYohann     code << "1";
1271ac421f39SYohann   } else {
1272ac421f39SYohann     code << "Q1d";
1273ac421f39SYohann   }
1274ac421f39SYohann   code << ", in, out);\n";
127564d3f0c0Sjeremylt   if (useCollograd) {
1276920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1277818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1278ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1279818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1280ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1281e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1282ac421f39SYohann       // Basis action
1283920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1284ac421f39SYohann       switch (emode) {
1285ac421f39SYohann       case CEED_EVAL_NONE:
1286ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1287ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1288ac421f39SYohann         code << "      }\n";
1289ac421f39SYohann         break; // No action
1290ac421f39SYohann       case CEED_EVAL_INTERP:
1291ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1292ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1293ac421f39SYohann         code << "      }\n";
1294ac421f39SYohann         break;
1295ac421f39SYohann       case CEED_EVAL_GRAD:
1296ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1297ac421f39SYohann         break;
1298ac421f39SYohann       case CEED_EVAL_WEIGHT:
1299ac421f39SYohann         break; // Should not occur
1300ac421f39SYohann       case CEED_EVAL_DIV:
1301ac421f39SYohann         break; // TODO: Not implemented
1302ac421f39SYohann       case CEED_EVAL_CURL:
1303ac421f39SYohann         break; // TODO: Not implemented
1304ac421f39SYohann       }
1305ac421f39SYohann     }
1306ac421f39SYohann     code << "    }\n";
1307ac421f39SYohann   }
1308241a4b83SYohann 
1309241a4b83SYohann   // Output basis apply if needed
1310ac421f39SYohann   // Generate the correct eval mode code for each output
1311818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1312241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1313818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1314241a4b83SYohann     // Get elemsize, emode, ncomp
1315241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1316e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1317241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1318e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1319241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1320e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13214d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1322e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1323241a4b83SYohann     // Basis action
1324920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1325241a4b83SYohann     switch (emode) {
1326241a4b83SYohann     case CEED_EVAL_NONE:
1327920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1328241a4b83SYohann       break; // No action
1329241a4b83SYohann     case CEED_EVAL_INTERP:
1330241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1331241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1332241a4b83SYohann       break;
1333241a4b83SYohann     case CEED_EVAL_GRAD:
1334241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
133564d3f0c0Sjeremylt       if (useCollograd) {
1336ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1337ac421f39SYohann       } else {
1338241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1339ac421f39SYohann       }
1340241a4b83SYohann       break;
1341e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1342241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1343241a4b83SYohann       Ceed ceed;
1344e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1345e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1346241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1347241a4b83SYohann       break; // Should not occur
1348241a4b83SYohann     }
1349241a4b83SYohann     case CEED_EVAL_DIV:
1350241a4b83SYohann       break; // TODO: Not implemented
1351241a4b83SYohann     case CEED_EVAL_CURL:
1352241a4b83SYohann       break; // TODO: Not implemented
1353e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1354241a4b83SYohann     }
1355920dcdc4Sjeremylt     // Restriction
13569a2291e3SJeremy L Thompson       bool isStrided;
1357e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13589a2291e3SJeremy L Thompson     if (!isStrided) {
13595c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1360e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13615c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13625c7b696cSjeremylt       CeedInt compstride;
1363e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13645c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
1365e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13669a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13675c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1368920dcdc4Sjeremylt     } else {
1369920dcdc4Sjeremylt       bool backendstrides;
1370fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1371e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
137213585805SJeremy L Thompson       CeedInt nelem;
137313585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1374e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
137513585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1376920dcdc4Sjeremylt       if (!backendstrides) {
1377920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1378e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
1379920dcdc4Sjeremylt       }
1380920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1381d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1382920dcdc4Sjeremylt     }
1383241a4b83SYohann   }
1384241a4b83SYohann 
1385241a4b83SYohann   code << "  }\n";
1386818e0025SJeremy L Thompson   code << "}\n";
1387818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1388241a4b83SYohann 
1389ab213215SJeremy L Thompson   // View kernel for debugging
1390*46dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13913f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
1392241a4b83SYohann 
139318d499f1SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
139418d499f1SYohann                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1395e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1396c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1397e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1398241a4b83SYohann 
1399e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1400e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1401241a4b83SYohann }
1402ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1403