xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision e9f4dca0bc4c408418619183f355fdd735c60606)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
177df94212SJeremy L Thompson 
18241a4b83SYohann #include "ceed-cuda-gen.h"
19241a4b83SYohann #include <iostream>
20241a4b83SYohann #include <sstream>
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25ab213215SJeremy L Thompson //------------------------------------------------------------------------------
26ab213215SJeremy L Thompson // Atomic add, for older CUDA
27ab213215SJeremy L Thompson //------------------------------------------------------------------------------
28241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
29241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
30241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
31241a4b83SYohann   do {
32241a4b83SYohann     assumed = old;
33241a4b83SYohann     old =
34241a4b83SYohann       atomicCAS(address_as_ull, assumed,
35241a4b83SYohann                 __double_as_longlong(val +
36241a4b83SYohann                                      __longlong_as_double(assumed)));
37241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
38241a4b83SYohann     // (since NaN != NaN)
39241a4b83SYohann   } while (assumed != old);
40241a4b83SYohann   return __longlong_as_double(old);
41241a4b83SYohann }
42f1a13f77SYohann Dudouit );
43f1a13f77SYohann Dudouit 
44f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
45f1a13f77SYohann Dudouit 
46ab213215SJeremy L Thompson //------------------------------------------------------------------------------
47ab213215SJeremy L Thompson // Typedefs
48ab213215SJeremy L Thompson //------------------------------------------------------------------------------
49f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
50f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
51f1a13f77SYohann Dudouit 
52f1a13f77SYohann Dudouit typedef struct {
53f1a13f77SYohann Dudouit   CeedInt tidx;
54f1a13f77SYohann Dudouit   CeedInt tidy;
55f1a13f77SYohann Dudouit   CeedInt tidz;
56f1a13f77SYohann Dudouit   CeedInt tid;
57f1a13f77SYohann Dudouit   CeedScalar* slice;
58f1a13f77SYohann Dudouit } BackendData;
59241a4b83SYohann 
60ab213215SJeremy L Thompson //------------------------------------------------------------------------------
61ab213215SJeremy L Thompson // Load matrices for basis actions
62ab213215SJeremy L Thompson //------------------------------------------------------------------------------
63241a4b83SYohann template <int P, int Q>
64241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
65ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
66241a4b83SYohann     B[i] = d_B[i];
67241a4b83SYohann }
68241a4b83SYohann 
69ab213215SJeremy L Thompson //------------------------------------------------------------------------------
70241a4b83SYohann // 1D
71ab213215SJeremy L Thompson //------------------------------------------------------------------------------
72ab213215SJeremy L Thompson 
73ab213215SJeremy L Thompson //------------------------------------------------------------------------------
74ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
75ab213215SJeremy L Thompson //------------------------------------------------------------------------------
765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
775c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78ab213215SJeremy L Thompson   if (data.tidx < P1d) {
794d537eeaSYohann     const CeedInt node = data.tidx;
80ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
81ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
825c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
83241a4b83SYohann   }
84241a4b83SYohann }
85920dcdc4Sjeremylt 
86ab213215SJeremy L Thompson //------------------------------------------------------------------------------
87ab213215SJeremy L Thompson // L-vector -> E-vector, strided
88ab213215SJeremy L Thompson //------------------------------------------------------------------------------
89d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
90d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
91ab213215SJeremy L Thompson   if (data.tidx < P1d) {
92ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
93d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
94ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
95d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
96ccedf6b0Sjeremylt   }
97ccedf6b0Sjeremylt }
98241a4b83SYohann 
99ab213215SJeremy L Thompson //------------------------------------------------------------------------------
100ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
101ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1025c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1035c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
104ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1054d537eeaSYohann     const CeedInt node = data.tidx;
106ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
107ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1085c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
109241a4b83SYohann   }
110241a4b83SYohann }
111241a4b83SYohann 
112ab213215SJeremy L Thompson //------------------------------------------------------------------------------
113ab213215SJeremy L Thompson // E-vector -> L-vector, strided
114ab213215SJeremy L Thompson //------------------------------------------------------------------------------
115d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
116d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
117ab213215SJeremy L Thompson   if (data.tidx < P1d) {
118ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
119d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
120ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
121d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
122ccedf6b0Sjeremylt   }
123ccedf6b0Sjeremylt }
124ccedf6b0Sjeremylt 
125ab213215SJeremy L Thompson //------------------------------------------------------------------------------
126ab213215SJeremy L Thompson // 1D tensor contraction x
127ab213215SJeremy L Thompson //------------------------------------------------------------------------------
128241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
129ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130241a4b83SYohann   data.slice[data.tidx] = *U;
131241a4b83SYohann   __syncthreads();
132241a4b83SYohann   *V = 0.0;
133ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
134ab213215SJeremy L Thompson     *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
135241a4b83SYohann   __syncthreads();
136241a4b83SYohann }
137241a4b83SYohann 
138ab213215SJeremy L Thompson //------------------------------------------------------------------------------
139ab213215SJeremy L Thompson // 1D transpose tensor contraction x
140ab213215SJeremy L Thompson //------------------------------------------------------------------------------
141241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
142ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
143241a4b83SYohann   data.slice[data.tidx] = *U;
144241a4b83SYohann   __syncthreads();
145241a4b83SYohann   *V = 0.0;
146ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
147ab213215SJeremy L Thompson     *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
148241a4b83SYohann   __syncthreads();
149241a4b83SYohann }
150241a4b83SYohann 
151ab213215SJeremy L Thompson //------------------------------------------------------------------------------
152ab213215SJeremy L Thompson // 1D interpolate to quadrature points
153ab213215SJeremy L Thompson //------------------------------------------------------------------------------
154241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
155ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
156ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
157241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
158241a4b83SYohann }
159241a4b83SYohann 
160ab213215SJeremy L Thompson //------------------------------------------------------------------------------
161ab213215SJeremy L Thompson // 1D interpolate transpose
162ab213215SJeremy L Thompson //------------------------------------------------------------------------------
163241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
164ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
165ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
166241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
167241a4b83SYohann }
168241a4b83SYohann 
169ab213215SJeremy L Thompson //------------------------------------------------------------------------------
170ab213215SJeremy L Thompson // 1D derivatives at quadrature points
171ab213215SJeremy L Thompson //------------------------------------------------------------------------------
172241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
173ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
174ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
175241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
176241a4b83SYohann }
177241a4b83SYohann 
178ab213215SJeremy L Thompson //------------------------------------------------------------------------------
179ab213215SJeremy L Thompson // 1D derivatives transpose
180ab213215SJeremy L Thompson //------------------------------------------------------------------------------
181241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
182ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
183ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
184241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
185241a4b83SYohann }
186241a4b83SYohann 
187ab213215SJeremy L Thompson //------------------------------------------------------------------------------
188241a4b83SYohann // 2D
189ab213215SJeremy L Thompson //------------------------------------------------------------------------------
190ab213215SJeremy L Thompson 
191ab213215SJeremy L Thompson //------------------------------------------------------------------------------
192ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
193ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1945c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
1955c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
196ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
1974d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
198ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
199ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2005c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
201241a4b83SYohann   }
202241a4b83SYohann }
203920dcdc4Sjeremylt 
204ab213215SJeremy L Thompson //------------------------------------------------------------------------------
205ab213215SJeremy L Thompson // L-vector -> E-vector, strided
206ab213215SJeremy L Thompson //------------------------------------------------------------------------------
207d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
208d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
209ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
210ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
211d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
212ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
213d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
214ccedf6b0Sjeremylt   }
215ccedf6b0Sjeremylt }
216241a4b83SYohann 
217ab213215SJeremy L Thompson //------------------------------------------------------------------------------
218ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
219ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2205c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
2215c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
222ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2234d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
224ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
225ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2265c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
227241a4b83SYohann   }
228241a4b83SYohann }
229241a4b83SYohann 
230ab213215SJeremy L Thompson //------------------------------------------------------------------------------
231ab213215SJeremy L Thompson // E-vector -> L-vector, strided
232ab213215SJeremy L Thompson //------------------------------------------------------------------------------
233d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
234d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
235ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
236ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
237d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
238ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
239d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
240ccedf6b0Sjeremylt   }
241ccedf6b0Sjeremylt }
242ccedf6b0Sjeremylt 
243ab213215SJeremy L Thompson //------------------------------------------------------------------------------
244ab213215SJeremy L Thompson // 2D tensor contraction x
245ab213215SJeremy L Thompson //------------------------------------------------------------------------------
246241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
247ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
248241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
249241a4b83SYohann   __syncthreads();
250241a4b83SYohann   *V = 0.0;
251ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
252ab213215SJeremy L Thompson     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
253241a4b83SYohann   __syncthreads();
254241a4b83SYohann }
255241a4b83SYohann 
256ab213215SJeremy L Thompson //------------------------------------------------------------------------------
257ab213215SJeremy L Thompson // 2D tensor contract y
258ab213215SJeremy L Thompson //------------------------------------------------------------------------------
259241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
260ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
261241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
262241a4b83SYohann   __syncthreads();
263241a4b83SYohann   *V = 0.0;
264ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
265ab213215SJeremy L Thompson     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
266241a4b83SYohann   __syncthreads();
267241a4b83SYohann }
268241a4b83SYohann 
269ab213215SJeremy L Thompson //------------------------------------------------------------------------------
270ab213215SJeremy L Thompson // 2D transpose tensor contract y
271ab213215SJeremy L Thompson //------------------------------------------------------------------------------
272241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
273ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
275241a4b83SYohann   __syncthreads();
276241a4b83SYohann   *V = 0.0;
277ab213215SJeremy L Thompson   if (data.tidy < P1d)
278ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
279ab213215SJeremy L Thompson       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
280241a4b83SYohann   __syncthreads();
281241a4b83SYohann }
282241a4b83SYohann 
283ab213215SJeremy L Thompson //------------------------------------------------------------------------------
284ab213215SJeremy L Thompson // 2D transpose tensor contract x
285ab213215SJeremy L Thompson //------------------------------------------------------------------------------
286241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
287ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
288241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
289241a4b83SYohann   __syncthreads();
290241a4b83SYohann   *V = 0.0;
291ab213215SJeremy L Thompson   if (data.tidx < P1d)
292ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
293ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
294241a4b83SYohann   __syncthreads();
295241a4b83SYohann }
296241a4b83SYohann 
297ab213215SJeremy L Thompson //------------------------------------------------------------------------------
298ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
299ab213215SJeremy L Thompson //------------------------------------------------------------------------------
300241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
301ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
303241a4b83SYohann   __syncthreads();
304ab213215SJeremy L Thompson   if (data.tidx < P1d)
305ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
306ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
307241a4b83SYohann   __syncthreads();
308241a4b83SYohann }
309241a4b83SYohann 
310ab213215SJeremy L Thompson //------------------------------------------------------------------------------
311ab213215SJeremy L Thompson // 2D interpolate to quadrature points
312ab213215SJeremy L Thompson //------------------------------------------------------------------------------
313241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
314ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315241a4b83SYohann   CeedScalar r_t[1];
316ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319241a4b83SYohann   }
320241a4b83SYohann }
321241a4b83SYohann 
322ab213215SJeremy L Thompson //------------------------------------------------------------------------------
323ab213215SJeremy L Thompson // 2D interpolate transpose
324ab213215SJeremy L Thompson //------------------------------------------------------------------------------
325241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
326ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327241a4b83SYohann   CeedScalar r_t[1];
328ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331241a4b83SYohann   }
332241a4b83SYohann }
333241a4b83SYohann 
334ab213215SJeremy L Thompson //------------------------------------------------------------------------------
335ab213215SJeremy L Thompson // 2D derivatives at quadrature points
336ab213215SJeremy L Thompson //------------------------------------------------------------------------------
337241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
338ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339241a4b83SYohann   CeedScalar r_t[1];
340ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345241a4b83SYohann   }
346241a4b83SYohann }
347241a4b83SYohann 
348ab213215SJeremy L Thompson //------------------------------------------------------------------------------
349ab213215SJeremy L Thompson // 2D derivatives transpose
350ab213215SJeremy L Thompson //------------------------------------------------------------------------------
351241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
352ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353241a4b83SYohann   CeedScalar r_t[1];
354ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359241a4b83SYohann   }
360241a4b83SYohann }
361241a4b83SYohann 
362ab213215SJeremy L Thompson //------------------------------------------------------------------------------
363241a4b83SYohann // 3D
364ab213215SJeremy L Thompson //------------------------------------------------------------------------------
365ab213215SJeremy L Thompson 
366ab213215SJeremy L Thompson //------------------------------------------------------------------------------
367ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
368ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3695c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
3705c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
371ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
372241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3734d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3765c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377241a4b83SYohann     }
378241a4b83SYohann }
379920dcdc4Sjeremylt 
380ab213215SJeremy L Thompson //------------------------------------------------------------------------------
381ab213215SJeremy L Thompson // L-vector -> E-vector, strided
382ab213215SJeremy L Thompson //------------------------------------------------------------------------------
383d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
385ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
386ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
387ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391ccedf6b0Sjeremylt     }
392ccedf6b0Sjeremylt }
393241a4b83SYohann 
394ab213215SJeremy L Thompson //------------------------------------------------------------------------------
395ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
396ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3975c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
3985c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
399ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
400920dcdc4Sjeremylt   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
401ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; ++comp)
4025c7b696cSjeremylt     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
403ac421f39SYohann }
404ac421f39SYohann 
405ab213215SJeremy L Thompson //------------------------------------------------------------------------------
406ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
407ab213215SJeremy L Thompson //------------------------------------------------------------------------------
408d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
40925dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
410920dcdc4Sjeremylt   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
411d80fc06aSjeremylt   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
412ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; ++comp)
413d80fc06aSjeremylt     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
414920dcdc4Sjeremylt }
415920dcdc4Sjeremylt 
416ab213215SJeremy L Thompson //------------------------------------------------------------------------------
417ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
418ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4195c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
4205c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
421ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
422241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4234d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
424ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
425ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4265c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
427241a4b83SYohann     }
428241a4b83SYohann }
429241a4b83SYohann 
430ab213215SJeremy L Thompson //------------------------------------------------------------------------------
431ab213215SJeremy L Thompson // E-vector -> L-vector, strided
432ab213215SJeremy L Thompson //------------------------------------------------------------------------------
433d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4342f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
435ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
436ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
437ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
438d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
439ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
440d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
441ccedf6b0Sjeremylt     }
442ccedf6b0Sjeremylt }
443ccedf6b0Sjeremylt 
444ab213215SJeremy L Thompson //------------------------------------------------------------------------------
445ab213215SJeremy L Thompson // 3D tensor contract x
446ab213215SJeremy L Thompson //------------------------------------------------------------------------------
447241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
448ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
449ac421f39SYohann   CeedScalar r_B[P1d];
450ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
451ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
452ab213215SJeremy L Thompson 
453ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
454241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
455241a4b83SYohann     __syncthreads();
456241a4b83SYohann     V[k] = 0.0;
457ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
458ab213215SJeremy L Thompson       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
459241a4b83SYohann     __syncthreads();
460241a4b83SYohann   }
461241a4b83SYohann }
462241a4b83SYohann 
463ab213215SJeremy L Thompson //------------------------------------------------------------------------------
464ab213215SJeremy L Thompson // 3D tensor contract y
465ab213215SJeremy L Thompson //------------------------------------------------------------------------------
466241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
467ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
468ac421f39SYohann   CeedScalar r_B[P1d];
469ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
470ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
471ab213215SJeremy L Thompson 
472ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
473241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
474241a4b83SYohann     __syncthreads();
475241a4b83SYohann     V[k] = 0.0;
476ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
477ab213215SJeremy L Thompson       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
478241a4b83SYohann     __syncthreads();
479241a4b83SYohann   }
480241a4b83SYohann }
481241a4b83SYohann 
482ab213215SJeremy L Thompson //------------------------------------------------------------------------------
483ab213215SJeremy L Thompson // 3D tensor contract z
484ab213215SJeremy L Thompson //------------------------------------------------------------------------------
485241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
486ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
487ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
488241a4b83SYohann     V[k] = 0.0;
489ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
490ab213215SJeremy L Thompson       V[k] += B[i + k*P1d] * U[i]; // Contract z direction
491241a4b83SYohann   }
492241a4b83SYohann }
493241a4b83SYohann 
494ab213215SJeremy L Thompson //------------------------------------------------------------------------------
495ab213215SJeremy L Thompson // 3D transpose tensor contract z
496ab213215SJeremy L Thompson //------------------------------------------------------------------------------
497241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
498ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
499ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
500241a4b83SYohann     V[k] = 0.0;
501ab213215SJeremy L Thompson     if (k < P1d)
502ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
503ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
504241a4b83SYohann   }
505241a4b83SYohann }
506241a4b83SYohann 
507ab213215SJeremy L Thompson //------------------------------------------------------------------------------
508ab213215SJeremy L Thompson // 3D transpose tensor contract y
509ab213215SJeremy L Thompson //------------------------------------------------------------------------------
510241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
511ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
512ac421f39SYohann   CeedScalar r_B[Q1d];
513ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
514ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
515ab213215SJeremy L Thompson 
516ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
517241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
518241a4b83SYohann     __syncthreads();
519241a4b83SYohann     V[k] = 0.0;
520ab213215SJeremy L Thompson     if (data.tidy < P1d)
521ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
522ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
523ac421f39SYohann     __syncthreads();
524ac421f39SYohann   }
525ac421f39SYohann }
526ac421f39SYohann 
527ab213215SJeremy L Thompson //------------------------------------------------------------------------------
528ab213215SJeremy L Thompson // 3D transpose tensor contract add y
529ab213215SJeremy L Thompson //------------------------------------------------------------------------------
530ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
531ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
532ac421f39SYohann   CeedScalar r_B[Q1d];
533ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
534ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
535ab213215SJeremy L Thompson 
536ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
537ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
538ac421f39SYohann     __syncthreads();
539ab213215SJeremy L Thompson     if (data.tidy < P1d)
540ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
541ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
542241a4b83SYohann     __syncthreads();
543241a4b83SYohann   }
544241a4b83SYohann }
545241a4b83SYohann 
546ab213215SJeremy L Thompson //------------------------------------------------------------------------------
547ab213215SJeremy L Thompson // 3D transpose tensor contract x
548ab213215SJeremy L Thompson //------------------------------------------------------------------------------
549241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
550ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
551ac421f39SYohann   CeedScalar r_B[Q1d];
552ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
553ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
554ab213215SJeremy L Thompson 
555ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
556241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
557241a4b83SYohann     __syncthreads();
558241a4b83SYohann     V[k] = 0.0;
559ab213215SJeremy L Thompson     if (data.tidx < P1d)
560ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
561ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
562241a4b83SYohann     __syncthreads();
563241a4b83SYohann   }
564241a4b83SYohann }
565241a4b83SYohann 
566ab213215SJeremy L Thompson //------------------------------------------------------------------------------
567ab213215SJeremy L Thompson // 3D transpose tensor contract add x
568ab213215SJeremy L Thompson //------------------------------------------------------------------------------
569241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
570ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
571ac421f39SYohann   CeedScalar r_B[Q1d];
572ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
573ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
574ab213215SJeremy L Thompson 
575ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
576241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
577241a4b83SYohann     __syncthreads();
578ab213215SJeremy L Thompson     if (data.tidx < P1d)
579ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
580ab213215SJeremy L Thompson         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
581241a4b83SYohann     __syncthreads();
582241a4b83SYohann   }
583241a4b83SYohann }
584241a4b83SYohann 
585ab213215SJeremy L Thompson //------------------------------------------------------------------------------
586ab213215SJeremy L Thompson // 3D interpolate to quadrature points
587ab213215SJeremy L Thompson //------------------------------------------------------------------------------
588241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
589ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
590241a4b83SYohann   CeedScalar r_t1[Q1d];
591241a4b83SYohann   CeedScalar r_t2[Q1d];
592ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
593241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
594241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
595241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
596241a4b83SYohann   }
597241a4b83SYohann }
598241a4b83SYohann 
599ab213215SJeremy L Thompson //------------------------------------------------------------------------------
600ab213215SJeremy L Thompson // 3D interpolate transpose
601ab213215SJeremy L Thompson //------------------------------------------------------------------------------
602241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
603ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
604241a4b83SYohann   CeedScalar r_t1[Q1d];
605241a4b83SYohann   CeedScalar r_t2[Q1d];
606ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
607241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
608241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
609241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
610241a4b83SYohann   }
611241a4b83SYohann }
612241a4b83SYohann 
613ab213215SJeremy L Thompson //------------------------------------------------------------------------------
614ab213215SJeremy L Thompson // 3D derivatives at quadrature points
615ab213215SJeremy L Thompson //------------------------------------------------------------------------------
616241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
617ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
618241a4b83SYohann   CeedScalar r_t1[Q1d];
619241a4b83SYohann   CeedScalar r_t2[Q1d];
620ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
621241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
622241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
623241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
624241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
625241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
626241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
627241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
628241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
629241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
630241a4b83SYohann   }
631241a4b83SYohann }
632241a4b83SYohann 
633ab213215SJeremy L Thompson //------------------------------------------------------------------------------
634ab213215SJeremy L Thompson // 3D derivatives transpose
635ab213215SJeremy L Thompson //------------------------------------------------------------------------------
636241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
637ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
638241a4b83SYohann   CeedScalar r_t1[Q1d];
639241a4b83SYohann   CeedScalar r_t2[Q1d];
640ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
641241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
642241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
643241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
644241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
645241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
646241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
647241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
648241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
649241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
650241a4b83SYohann   }
651241a4b83SYohann }
652241a4b83SYohann 
653ab213215SJeremy L Thompson //------------------------------------------------------------------------------
654ab213215SJeremy L Thompson // 3D collocated derivatives computation
655ab213215SJeremy L Thompson //------------------------------------------------------------------------------
656ac421f39SYohann template <int NCOMP, int Q1d>
657ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
658ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
659ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
660ac421f39SYohann     __syncthreads();
661ac421f39SYohann     // X derivative
662ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
663ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
664ab213215SJeremy L Thompson       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
665ac421f39SYohann     // Y derivative
666ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
667ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
668ab213215SJeremy L Thompson       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
669ac421f39SYohann     // Z derivative
670ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
671ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
672ab213215SJeremy L Thompson       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
673ac421f39SYohann     __syncthreads();
674ac421f39SYohann   }
675ac421f39SYohann }
676ac421f39SYohann 
677ab213215SJeremy L Thompson //------------------------------------------------------------------------------
678ab213215SJeremy L Thompson // 3D collocated derivatives transpose
679ab213215SJeremy L Thompson //------------------------------------------------------------------------------
680ac421f39SYohann template <int NCOMP, int Q1d>
681ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
682ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
683ac421f39SYohann     // X derivative
684ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
685ac421f39SYohann     __syncthreads();
686ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
687ab213215SJeremy L Thompson       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
688ac421f39SYohann     __syncthreads();
689ac421f39SYohann     // Y derivative
690ac421f39SYohann     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
691ac421f39SYohann     __syncthreads();
692ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
693ab213215SJeremy L Thompson       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
694ac421f39SYohann     __syncthreads();
695ac421f39SYohann     // Z derivative
696ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
697ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
698ac421f39SYohann   }
699ac421f39SYohann }
700ac421f39SYohann 
701ab213215SJeremy L Thompson //------------------------------------------------------------------------------
702ab213215SJeremy L Thompson // 1D quadrature weights
703ab213215SJeremy L Thompson //------------------------------------------------------------------------------
704241a4b83SYohann template <int Q1d>
705241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
706241a4b83SYohann   *w = qweight1d[data.tidx];
707241a4b83SYohann }
708241a4b83SYohann 
709ab213215SJeremy L Thompson //------------------------------------------------------------------------------
710ab213215SJeremy L Thompson // 2D quadrature weights
711ab213215SJeremy L Thompson //------------------------------------------------------------------------------
712241a4b83SYohann template <int Q1d>
713241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
714241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
715241a4b83SYohann }
716241a4b83SYohann 
717ab213215SJeremy L Thompson //------------------------------------------------------------------------------
718ab213215SJeremy L Thompson // 3D quadrature weights
719ab213215SJeremy L Thompson //------------------------------------------------------------------------------
720241a4b83SYohann template <int Q1d>
721241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
722241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
723ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
724241a4b83SYohann     w[z] = pw*qweight1d[z];
725241a4b83SYohann }
726241a4b83SYohann 
727241a4b83SYohann );
728ab213215SJeremy L Thompson //------------------------------------------------------------------------------
729ab213215SJeremy L Thompson // Build singe operator kernel
730ab213215SJeremy L Thompson //------------------------------------------------------------------------------
731241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
732241a4b83SYohann 
733241a4b83SYohann   using std::ostringstream;
734241a4b83SYohann   using std::string;
735241a4b83SYohann   int ierr;
736241a4b83SYohann   bool setupdone;
737fd364f38SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
738241a4b83SYohann   if (setupdone) return 0;
739241a4b83SYohann   Ceed ceed;
740241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
741241a4b83SYohann   CeedOperator_Cuda_gen *data;
742241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
743241a4b83SYohann   CeedQFunction qf;
744241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
745241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
746241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
7471da99368SJeremy L Thompson   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
7485c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
749241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
750241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
751241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
752241a4b83SYohann   CeedChk(ierr);
753241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
754241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
755241a4b83SYohann   CeedChk(ierr);
756241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
757241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
758241a4b83SYohann   CeedChk(ierr);
759241a4b83SYohann   CeedEvalMode emode;
760241a4b83SYohann   CeedBasis basis;
761241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
762241a4b83SYohann   CeedElemRestriction Erestrict;
763241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
764241a4b83SYohann 
765241a4b83SYohann   ostringstream code;
766241a4b83SYohann   string devFunctions(deviceFunctions);
767241a4b83SYohann 
768f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
769f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
770f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
771abfaacbbSSander Arens   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
772f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
773f1a13f77SYohann Dudouit   if (prop.major<6){
774f1a13f77SYohann Dudouit     code << atomicAdd;
775f1a13f77SYohann Dudouit   }
776f1a13f77SYohann Dudouit 
777241a4b83SYohann   code << devFunctions;
778241a4b83SYohann 
779241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
780c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
781c3c443faSYohann Dudouit   string oper;
78214cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
7834d537eeaSYohann 
7844d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
785ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
7861da99368SJeremy L Thompson 
7871da99368SJeremy L Thompson   // Find dim and Q1d
7881da99368SJeremy L Thompson   bool collograd = false;
7891da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
7901da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
7911da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
7921da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
7931da99368SJeremy L Thompson 
7941da99368SJeremy L Thompson       // Check for collocated gradient
7951da99368SJeremy L Thompson       if (basis_data->d_collograd1d)
7961da99368SJeremy L Thompson         collograd = true;
7971da99368SJeremy L Thompson 
7981da99368SJeremy L Thompson       // Collect dim and Q1d
7991da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
8001da99368SJeremy L Thompson       bool isTensor;
801fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8021da99368SJeremy L Thompson       if (isTensor) {
8031da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
8041da99368SJeremy L Thompson       } else {
805*e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
8061da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
807*e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8081da99368SJeremy L Thompson         }
8091da99368SJeremy L Thompson     }
8101da99368SJeremy L Thompson   }
8111da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8121da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8131da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
8141da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
8151da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
8161da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
8171da99368SJeremy L Thompson       // Collect dim and Q1d
8181da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
8191da99368SJeremy L Thompson       bool isTensor;
820fd364f38SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
8211da99368SJeremy L Thompson       if (isTensor) {
8221da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
8231da99368SJeremy L Thompson       } else {
824*e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
8251da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
826*e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8271da99368SJeremy L Thompson         }
8281da99368SJeremy L Thompson     }
8291da99368SJeremy L Thompson   }
8301da99368SJeremy L Thompson   data->dim = dim;
8311da99368SJeremy L Thompson   data->Q1d = Q1d;
8321da99368SJeremy L Thompson 
8331da99368SJeremy L Thompson   // Define CEED_Q_VLA
8341da99368SJeremy L Thompson   if (dim != 3 || collograd) {
8351da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8361da99368SJeremy L Thompson   } else {
8371da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8381da99368SJeremy L Thompson   }
8391da99368SJeremy L Thompson 
840241a4b83SYohann   code << qFunction;
841241a4b83SYohann 
842241a4b83SYohann   // Setup
843818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
844c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
845241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
846241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
847241a4b83SYohann     CeedChk(ierr);
8481da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
849241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
850241a4b83SYohann     }
851241a4b83SYohann   }
852241a4b83SYohann 
853241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
854241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
855241a4b83SYohann   }
8561da99368SJeremy L Thompson 
857241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
858241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8591da99368SJeremy L Thompson 
860241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
861241a4b83SYohann   code << "  BackendData data;\n";
862241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
863241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
864241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
865241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
866241a4b83SYohann   code << "  data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
867920dcdc4Sjeremylt 
868818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
869ac421f39SYohann   //Initialize constants, and matrices B and G
870241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
871818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
872241a4b83SYohann     // Get elemsize, emode, ncomp
873241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
874241a4b83SYohann     CeedChk(ierr);
875241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
876241a4b83SYohann     CeedChk(ierr);
877241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
878241a4b83SYohann     CeedChk(ierr);
8794d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
880241a4b83SYohann     CeedChk(ierr);
881920dcdc4Sjeremylt 
882920dcdc4Sjeremylt     // Set field constants
883920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
884920dcdc4Sjeremylt       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
885920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
886920dcdc4Sjeremylt         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
887920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
888920dcdc4Sjeremylt       } else {
889920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
890920dcdc4Sjeremylt       }
891920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
892920dcdc4Sjeremylt     }
893920dcdc4Sjeremylt 
894920dcdc4Sjeremylt     // Load basis data
895920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
896241a4b83SYohann     switch (emode) {
897241a4b83SYohann     case CEED_EVAL_NONE:
898241a4b83SYohann       break;
899241a4b83SYohann     case CEED_EVAL_INTERP:
900241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
901241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
902241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
903241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
904241a4b83SYohann       break;
905241a4b83SYohann     case CEED_EVAL_GRAD:
906241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
907241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
908241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
909241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
910ac421f39SYohann       if (basis_data->d_collograd1d) {
911ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
912ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
913ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
914ac421f39SYohann       } else {
915ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
916ac421f39SYohann         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
917241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
918ac421f39SYohann       }
919241a4b83SYohann       break;
920241a4b83SYohann     case CEED_EVAL_WEIGHT:
921241a4b83SYohann       break; // No action
922241a4b83SYohann     case CEED_EVAL_DIV:
923241a4b83SYohann       break; // TODO: Not implemented
924241a4b83SYohann     case CEED_EVAL_CURL:
925241a4b83SYohann       break; // TODO: Not implemented
926241a4b83SYohann     }
927241a4b83SYohann   }
928920dcdc4Sjeremylt 
929818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
930241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
931818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
932241a4b83SYohann     // Get elemsize, emode, ncomp
933241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
934241a4b83SYohann     CeedChk(ierr);
935241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
936241a4b83SYohann     CeedChk(ierr);
937241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
938241a4b83SYohann     CeedChk(ierr);
9394d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
940241a4b83SYohann     CeedChk(ierr);
941920dcdc4Sjeremylt 
942920dcdc4Sjeremylt     // Set field constants
943241a4b83SYohann     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
944920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
945241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
946241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
947920dcdc4Sjeremylt     } else {
948920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
949920dcdc4Sjeremylt     }
950920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
951920dcdc4Sjeremylt 
952920dcdc4Sjeremylt     // Load basis data
953920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
954920dcdc4Sjeremylt     switch (emode) {
955920dcdc4Sjeremylt     case CEED_EVAL_NONE:
956920dcdc4Sjeremylt       break; // No action
957920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
958241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
959241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
960241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
961241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
962241a4b83SYohann       break;
963241a4b83SYohann     case CEED_EVAL_GRAD:
964241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
965241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
966241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
967241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
968ac421f39SYohann       if (basis_data->d_collograd1d) {
969ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
970ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
971ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
972ac421f39SYohann       } else {
973ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
974ac421f39SYohann         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
975241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
976ac421f39SYohann       }
977241a4b83SYohann       break;
978*e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
979241a4b83SYohann     case CEED_EVAL_WEIGHT: {
980241a4b83SYohann       Ceed ceed;
981241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
982241a4b83SYohann       return CeedError(ceed, 1,
983241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
984241a4b83SYohann       break; // Should not occur
985241a4b83SYohann     }
986241a4b83SYohann     case CEED_EVAL_DIV:
987241a4b83SYohann       break; // TODO: Not implemented
988241a4b83SYohann     case CEED_EVAL_CURL:
989241a4b83SYohann       break; // TODO: Not implemented
990*e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
991241a4b83SYohann     }
992241a4b83SYohann   }
993818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
994ac421f39SYohann   code << "  __syncthreads();\n";
995241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
996241a4b83SYohann   // Input basis apply if needed
997ac421f39SYohann   // Generate the correct eval mode code for each input
998818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
999241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1000818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1001241a4b83SYohann     // Get elemsize, emode, ncomp
1002241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1003241a4b83SYohann     CeedChk(ierr);
1004241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1005241a4b83SYohann     CeedChk(ierr);
1006241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1007241a4b83SYohann     CeedChk(ierr);
10084d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1009241a4b83SYohann     CeedChk(ierr);
1010920dcdc4Sjeremylt 
1011920dcdc4Sjeremylt     // Restriction
1012920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
1013920dcdc4Sjeremylt         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1014241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10159a2291e3SJeremy L Thompson 
10169a2291e3SJeremy L Thompson       bool isStrided;
1017fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
10189a2291e3SJeremy L Thompson       if (!isStrided) {
10195c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
10205c7b696cSjeremylt         CeedChk(ierr);
10215c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10225c7b696cSjeremylt         CeedInt compstride;
10235c7b696cSjeremylt         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
10245c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
10259a2291e3SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
10269a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10275c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1028ccedf6b0Sjeremylt       } else {
1029920dcdc4Sjeremylt         bool backendstrides;
1030fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1031920dcdc4Sjeremylt         CeedChk(ierr);
103213585805SJeremy L Thompson         CeedInt nelem;
103313585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
103413585805SJeremy L Thompson         CeedChk(ierr);
103513585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1036920dcdc4Sjeremylt         if (!backendstrides) {
1037920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1038920dcdc4Sjeremylt           CeedChk(ierr);
1039ccedf6b0Sjeremylt         }
1040920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1041d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1042920dcdc4Sjeremylt       }
1043920dcdc4Sjeremylt     }
1044920dcdc4Sjeremylt 
1045920dcdc4Sjeremylt     // Basis action
1046920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1047920dcdc4Sjeremylt     switch (emode) {
1048920dcdc4Sjeremylt     case CEED_EVAL_NONE:
1049920dcdc4Sjeremylt       if (!basis_data->d_collograd1d) {
1050920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1051920dcdc4Sjeremylt       }
1052920dcdc4Sjeremylt       break;
1053920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1054241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1055241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1056241a4b83SYohann       break;
1057241a4b83SYohann     case CEED_EVAL_GRAD:
1058ac421f39SYohann       if (basis_data->d_collograd1d) {
1059ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1060ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1061ac421f39SYohann       } else {
1062241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1063241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1064ac421f39SYohann       }
1065241a4b83SYohann       break;
1066241a4b83SYohann     case CEED_EVAL_WEIGHT:
1067241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1068241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1069241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1070241a4b83SYohann       data->W = basis_data->d_qweight1d;
1071241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1072241a4b83SYohann       break; // No action
1073241a4b83SYohann     case CEED_EVAL_DIV:
1074241a4b83SYohann       break; // TODO: Not implemented
1075241a4b83SYohann     case CEED_EVAL_CURL:
1076241a4b83SYohann       break; // TODO: Not implemented
1077241a4b83SYohann     }
1078241a4b83SYohann   }
1079ac421f39SYohann 
1080241a4b83SYohann   // Q function
1081818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1082241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1083818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1084241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1085241a4b83SYohann     CeedChk(ierr);
1086241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1087241a4b83SYohann     {
1088ac421f39SYohann       if (basis_data->d_collograd1d) {
1089ac421f39SYohann         //Accumulator for gradient slices
1090ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1091ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1092ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1093ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1094ac421f39SYohann         code << "      }\n";
1095ac421f39SYohann         code << "    }\n";
1096ac421f39SYohann       } else {
1097241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1098241a4b83SYohann       }
1099ac421f39SYohann     }
1100241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1101241a4b83SYohann     {
1102241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1103241a4b83SYohann     }
1104241a4b83SYohann   }
1105ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
1106ac421f39SYohann   if (basis_data->d_collograd1d) {
1107920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1108ac421f39SYohann     code << "#pragma unroll\n";
1109ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1110818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1111ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1112818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1113ac421f39SYohann       // Get elemsize, emode, ncomp
1114ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1115ac421f39SYohann       CeedChk(ierr);
1116ac421f39SYohann       // Basis action
1117920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1118ac421f39SYohann       switch (emode) {
1119ac421f39SYohann       case CEED_EVAL_NONE:
1120ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11219a2291e3SJeremy L Thompson 
11229a2291e3SJeremy L Thompson         bool isStrided;
112313585805SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1124fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
11259a2291e3SJeremy L Thompson         if (!isStrided) {
11265c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
11275c7b696cSjeremylt           CeedChk(ierr);
11285c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11295c7b696cSjeremylt           CeedInt compstride;
11305c7b696cSjeremylt           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
11315c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
11329a2291e3SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
11339a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11345c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1135920dcdc4Sjeremylt         } else {
1136920dcdc4Sjeremylt           bool backendstrides;
1137fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1138920dcdc4Sjeremylt           CeedChk(ierr);
113913585805SJeremy L Thompson           CeedInt nelem;
114013585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
114113585805SJeremy L Thompson           CeedChk(ierr);
114213585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1143920dcdc4Sjeremylt           if (!backendstrides) {
1144920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1145920dcdc4Sjeremylt             CeedChk(ierr);
1146920dcdc4Sjeremylt           }
1147920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1148d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1149920dcdc4Sjeremylt         }
1150ac421f39SYohann         break;
1151ac421f39SYohann       case CEED_EVAL_INTERP:
1152ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1153ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1154ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1155ac421f39SYohann         code << "      }\n";
1156ac421f39SYohann         break;
1157ac421f39SYohann       case CEED_EVAL_GRAD:
1158ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1159ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1160ac421f39SYohann         break;
1161ac421f39SYohann       case CEED_EVAL_WEIGHT:
1162ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1163ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1164ac421f39SYohann         break; // No action
1165ac421f39SYohann       case CEED_EVAL_DIV:
1166ac421f39SYohann         break; // TODO: Not implemented
1167ac421f39SYohann       case CEED_EVAL_CURL:
1168ac421f39SYohann         break; // TODO: Not implemented
1169ac421f39SYohann       }
1170ac421f39SYohann     }
1171818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1172ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1173818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1174ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1175ac421f39SYohann       CeedChk(ierr);
1176ac421f39SYohann       // Basis action
1177ac421f39SYohann       switch (emode) {
1178ac421f39SYohann       case CEED_EVAL_NONE:
1179ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1180ac421f39SYohann         break; // No action
1181ac421f39SYohann       case CEED_EVAL_INTERP:
1182ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1183ac421f39SYohann         break;
1184ac421f39SYohann       case CEED_EVAL_GRAD:
1185ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1186ac421f39SYohann         break;
1187ac421f39SYohann       case CEED_EVAL_WEIGHT:
1188ac421f39SYohann         break; // Should not occur
1189ac421f39SYohann       case CEED_EVAL_DIV:
1190ac421f39SYohann         break; // TODO: Not implemented
1191ac421f39SYohann       case CEED_EVAL_CURL:
1192ac421f39SYohann         break; // TODO: Not implemented
1193ac421f39SYohann       }
1194ac421f39SYohann     }
1195ac421f39SYohann   } else {
1196920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1197818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1198ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1199818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1200ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1201ac421f39SYohann     }
1202818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1203ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1204818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1205ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1206ac421f39SYohann     }
1207ac421f39SYohann   }
1208818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12094d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12104d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1211818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1212ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12134d537eeaSYohann   }
12144d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12154d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1216818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1217ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12184d537eeaSYohann   }
1219818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1220ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
1221ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1222ac421f39SYohann     code << "1";
1223ac421f39SYohann   } else {
1224ac421f39SYohann     code << "Q1d";
1225ac421f39SYohann   }
1226ac421f39SYohann   code << ", in, out);\n";
1227ac421f39SYohann   if (basis_data->d_collograd1d) {
1228920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1229818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1230ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1231818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1232ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1233ac421f39SYohann       CeedChk(ierr);
1234ac421f39SYohann       // Basis action
1235920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1236ac421f39SYohann       switch (emode) {
1237ac421f39SYohann       case CEED_EVAL_NONE:
1238ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1239ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1240ac421f39SYohann         code << "      }\n";
1241ac421f39SYohann         break; // No action
1242ac421f39SYohann       case CEED_EVAL_INTERP:
1243ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1244ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1245ac421f39SYohann         code << "      }\n";
1246ac421f39SYohann         break;
1247ac421f39SYohann       case CEED_EVAL_GRAD:
1248ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1249ac421f39SYohann         break;
1250ac421f39SYohann       case CEED_EVAL_WEIGHT:
1251ac421f39SYohann         break; // Should not occur
1252ac421f39SYohann       case CEED_EVAL_DIV:
1253ac421f39SYohann         break; // TODO: Not implemented
1254ac421f39SYohann       case CEED_EVAL_CURL:
1255ac421f39SYohann         break; // TODO: Not implemented
1256ac421f39SYohann       }
1257ac421f39SYohann     }
1258ac421f39SYohann     code << "    }\n";
1259ac421f39SYohann   }
1260241a4b83SYohann 
1261241a4b83SYohann   // Output basis apply if needed
1262ac421f39SYohann   // Generate the correct eval mode code for each output
1263818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1264241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1265818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1266241a4b83SYohann     // Get elemsize, emode, ncomp
1267241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1268241a4b83SYohann     CeedChk(ierr);
1269241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1270241a4b83SYohann     CeedChk(ierr);
1271241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1272241a4b83SYohann     CeedChk(ierr);
12734d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1274241a4b83SYohann     CeedChk(ierr);
1275241a4b83SYohann     // Basis action
1276920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1277241a4b83SYohann     switch (emode) {
1278241a4b83SYohann     case CEED_EVAL_NONE:
1279920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1280241a4b83SYohann       break; // No action
1281241a4b83SYohann     case CEED_EVAL_INTERP:
1282241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1283241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1284241a4b83SYohann       break;
1285241a4b83SYohann     case CEED_EVAL_GRAD:
1286241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1287ac421f39SYohann       if (basis_data->d_collograd1d) {
1288ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1289ac421f39SYohann       } else {
1290241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1291ac421f39SYohann       }
1292241a4b83SYohann       break;
1293*e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1294241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1295241a4b83SYohann       Ceed ceed;
1296241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1297241a4b83SYohann       return CeedError(ceed, 1,
1298241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1299241a4b83SYohann       break; // Should not occur
1300241a4b83SYohann     }
1301241a4b83SYohann     case CEED_EVAL_DIV:
1302241a4b83SYohann       break; // TODO: Not implemented
1303241a4b83SYohann     case CEED_EVAL_CURL:
1304241a4b83SYohann       break; // TODO: Not implemented
1305*e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1306241a4b83SYohann     }
1307920dcdc4Sjeremylt     // Restriction
13089a2291e3SJeremy L Thompson       bool isStrided;
1309fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
13109a2291e3SJeremy L Thompson     if (!isStrided) {
13115c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
13125c7b696cSjeremylt       CeedChk(ierr);
13135c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13145c7b696cSjeremylt       CeedInt compstride;
13155c7b696cSjeremylt       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
13165c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
13179a2291e3SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
13189a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13195c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1320920dcdc4Sjeremylt     } else {
1321920dcdc4Sjeremylt       bool backendstrides;
1322fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1323920dcdc4Sjeremylt       CeedChk(ierr);
132413585805SJeremy L Thompson       CeedInt nelem;
132513585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
132613585805SJeremy L Thompson       CeedChk(ierr);
132713585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1328920dcdc4Sjeremylt       if (!backendstrides) {
1329920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1330920dcdc4Sjeremylt         CeedChk(ierr);
1331920dcdc4Sjeremylt       }
1332920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1333d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1334920dcdc4Sjeremylt     }
1335241a4b83SYohann   }
1336241a4b83SYohann 
1337241a4b83SYohann   code << "  }\n";
1338818e0025SJeremy L Thompson   code << "}\n";
1339818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1340241a4b83SYohann 
1341ab213215SJeremy L Thompson   // View kernel for debugging
1342b8e71988SJeremy L Thompson   CeedDebug(code.str().c_str());
1343241a4b83SYohann 
1344920dcdc4Sjeremylt   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1345920dcdc4Sjeremylt   CeedChk(ierr);
1346c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1347241a4b83SYohann   CeedChk(ierr);
1348241a4b83SYohann 
1349241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1350241a4b83SYohann   return 0;
1351241a4b83SYohann }
1352ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1353