xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 1d47fde22da38d209d144c95b0649ec760f07c05)
13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3241a4b83SYohann //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5241a4b83SYohann //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
97df94212SJeremy L Thompson 
10ec3da8bcSJed Brown #include <ceed/ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
123d576824SJeremy L Thompson #include <cuda_runtime.h>
13241a4b83SYohann #include <iostream>
14241a4b83SYohann #include <sstream>
153d576824SJeremy L Thompson #include "ceed-cuda-gen.h"
166d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
170d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h"
18241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
19241a4b83SYohann 
20f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
21ab213215SJeremy L Thompson //------------------------------------------------------------------------------
22ab213215SJeremy L Thompson // Atomic add, for older CUDA
23ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2480a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
25241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
26241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
27241a4b83SYohann   do {
28241a4b83SYohann     assumed = old;
29241a4b83SYohann     old =
30241a4b83SYohann       atomicCAS(address_as_ull, assumed,
31241a4b83SYohann                 __double_as_longlong(val +
32241a4b83SYohann                                     __longlong_as_double(assumed)));
33241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
34241a4b83SYohann     // (since NaN != NaN)
35241a4b83SYohann   } while (assumed != old);
36241a4b83SYohann   return __longlong_as_double(old);
37241a4b83SYohann }
38f1a13f77SYohann Dudouit );
39f1a13f77SYohann Dudouit 
40f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
41f1a13f77SYohann Dudouit 
42ab213215SJeremy L Thompson //------------------------------------------------------------------------------
43ab213215SJeremy L Thompson // Typedefs
44ab213215SJeremy L Thompson //------------------------------------------------------------------------------
45f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
46f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
47f1a13f77SYohann Dudouit 
48f1a13f77SYohann Dudouit typedef struct {
49f1a13f77SYohann Dudouit   CeedInt tidx;
50f1a13f77SYohann Dudouit   CeedInt tidy;
51f1a13f77SYohann Dudouit   CeedInt tidz;
52f1a13f77SYohann Dudouit   CeedInt tid;
53f1a13f77SYohann Dudouit   CeedScalar* slice;
54f1a13f77SYohann Dudouit } BackendData;
55241a4b83SYohann 
56ab213215SJeremy L Thompson //------------------------------------------------------------------------------
57ab213215SJeremy L Thompson // Load matrices for basis actions
58ab213215SJeremy L Thompson //------------------------------------------------------------------------------
59241a4b83SYohann template <int P, int Q>
60053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
61ab213215SJeremy L Thompson   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
62241a4b83SYohann     B[i] = d_B[i];
63241a4b83SYohann }
64241a4b83SYohann 
65ab213215SJeremy L Thompson //------------------------------------------------------------------------------
66241a4b83SYohann // 1D
67ab213215SJeremy L Thompson //------------------------------------------------------------------------------
68ab213215SJeremy L Thompson 
69ab213215SJeremy L Thompson //------------------------------------------------------------------------------
70ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
71ab213215SJeremy L Thompson //------------------------------------------------------------------------------
725c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
73053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
74ab213215SJeremy L Thompson   if (data.tidx < P1d) {
754d537eeaSYohann     const CeedInt node = data.tidx;
76ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
77ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
785c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
79241a4b83SYohann   }
80241a4b83SYohann }
81920dcdc4Sjeremylt 
82ab213215SJeremy L Thompson //------------------------------------------------------------------------------
83ab213215SJeremy L Thompson // L-vector -> E-vector, strided
84ab213215SJeremy L Thompson //------------------------------------------------------------------------------
85d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
86053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
87ab213215SJeremy L Thompson   if (data.tidx < P1d) {
88ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
89d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
90ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
91d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
92ccedf6b0Sjeremylt   }
93ccedf6b0Sjeremylt }
94241a4b83SYohann 
95ab213215SJeremy L Thompson //------------------------------------------------------------------------------
96ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
97ab213215SJeremy L Thompson //------------------------------------------------------------------------------
985c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
99053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
100ab213215SJeremy L Thompson   if (data.tidx < P1d) {
1014d537eeaSYohann     const CeedInt node = data.tidx;
102ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
103ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1045c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
105241a4b83SYohann   }
106241a4b83SYohann }
107241a4b83SYohann 
108ab213215SJeremy L Thompson //------------------------------------------------------------------------------
109ab213215SJeremy L Thompson // E-vector -> L-vector, strided
110ab213215SJeremy L Thompson //------------------------------------------------------------------------------
111d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
112d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
113ab213215SJeremy L Thompson   if (data.tidx < P1d) {
114ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
115d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
116ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
117d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
118ccedf6b0Sjeremylt   }
119ccedf6b0Sjeremylt }
120ccedf6b0Sjeremylt 
121ab213215SJeremy L Thompson //------------------------------------------------------------------------------
122ab213215SJeremy L Thompson // 1D tensor contraction x
123ab213215SJeremy L Thompson //------------------------------------------------------------------------------
124241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
125ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
126241a4b83SYohann   data.slice[data.tidx] = *U;
127241a4b83SYohann   __syncthreads();
128241a4b83SYohann   *V = 0.0;
12918d499f1SYohann   if (data.tidx < Q1d)
130ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
131ab213215SJeremy L Thompson       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
132241a4b83SYohann   __syncthreads();
133241a4b83SYohann }
134241a4b83SYohann 
135ab213215SJeremy L Thompson //------------------------------------------------------------------------------
136ab213215SJeremy L Thompson // 1D transpose tensor contraction x
137ab213215SJeremy L Thompson //------------------------------------------------------------------------------
138241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
139ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140241a4b83SYohann   data.slice[data.tidx] = *U;
141241a4b83SYohann   __syncthreads();
142241a4b83SYohann   *V = 0.0;
14318d499f1SYohann   if (data.tidx < P1d)
144ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
145ab213215SJeremy L Thompson       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
146241a4b83SYohann   __syncthreads();
147241a4b83SYohann }
148241a4b83SYohann 
149ab213215SJeremy L Thompson //------------------------------------------------------------------------------
150ab213215SJeremy L Thompson // 1D interpolate to quadrature points
151ab213215SJeremy L Thompson //------------------------------------------------------------------------------
152241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
153ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
154ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
155241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
156241a4b83SYohann }
157241a4b83SYohann 
158ab213215SJeremy L Thompson //------------------------------------------------------------------------------
159ab213215SJeremy L Thompson // 1D interpolate transpose
160ab213215SJeremy L Thompson //------------------------------------------------------------------------------
161241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
162ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163ab213215SJeremy L Thompson   for (CeedInt comp=0; comp<NCOMP; comp++)
164241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165241a4b83SYohann }
166241a4b83SYohann 
167ab213215SJeremy L Thompson //------------------------------------------------------------------------------
168ab213215SJeremy L Thompson // 1D derivatives at quadrature points
169ab213215SJeremy L Thompson //------------------------------------------------------------------------------
170241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
171ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
172ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
173241a4b83SYohann     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
174241a4b83SYohann }
175241a4b83SYohann 
176ab213215SJeremy L Thompson //------------------------------------------------------------------------------
177ab213215SJeremy L Thompson // 1D derivatives transpose
178ab213215SJeremy L Thompson //------------------------------------------------------------------------------
179241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
180ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181ab213215SJeremy L Thompson   for (CeedInt comp = 0; comp < NCOMP; comp++)
182241a4b83SYohann     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183241a4b83SYohann }
184241a4b83SYohann 
185ab213215SJeremy L Thompson //------------------------------------------------------------------------------
186241a4b83SYohann // 2D
187ab213215SJeremy L Thompson //------------------------------------------------------------------------------
188ab213215SJeremy L Thompson 
189ab213215SJeremy L Thompson //------------------------------------------------------------------------------
190ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
191ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1925c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
193053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
194ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
1954d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
196ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
197ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1985c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
199241a4b83SYohann   }
200241a4b83SYohann }
201920dcdc4Sjeremylt 
202ab213215SJeremy L Thompson //------------------------------------------------------------------------------
203ab213215SJeremy L Thompson // L-vector -> E-vector, strided
204ab213215SJeremy L Thompson //------------------------------------------------------------------------------
205d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
206053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
207ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
208ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
209d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
210ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
211d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
212ccedf6b0Sjeremylt   }
213ccedf6b0Sjeremylt }
214241a4b83SYohann 
215ab213215SJeremy L Thompson //------------------------------------------------------------------------------
216ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
217ab213215SJeremy L Thompson //------------------------------------------------------------------------------
2185c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
219053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
220ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
2214d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
222ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
223ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2245c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
225241a4b83SYohann   }
226241a4b83SYohann }
227241a4b83SYohann 
228ab213215SJeremy L Thompson //------------------------------------------------------------------------------
229ab213215SJeremy L Thompson // E-vector -> L-vector, strided
230ab213215SJeremy L Thompson //------------------------------------------------------------------------------
231d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
232d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
233ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d) {
234ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
235d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
236ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
237d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
238ccedf6b0Sjeremylt   }
239ccedf6b0Sjeremylt }
240ccedf6b0Sjeremylt 
241ab213215SJeremy L Thompson //------------------------------------------------------------------------------
242ab213215SJeremy L Thompson // 2D tensor contraction x
243ab213215SJeremy L Thompson //------------------------------------------------------------------------------
244241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
245ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
24618d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
247241a4b83SYohann   __syncthreads();
248241a4b83SYohann   *V = 0.0;
24918d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
250ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
25118d499f1SYohann       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
252241a4b83SYohann   __syncthreads();
253241a4b83SYohann }
254241a4b83SYohann 
255ab213215SJeremy L Thompson //------------------------------------------------------------------------------
256ab213215SJeremy L Thompson // 2D tensor contract y
257ab213215SJeremy L Thompson //------------------------------------------------------------------------------
258241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
259ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
26018d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
261241a4b83SYohann   __syncthreads();
262241a4b83SYohann   *V = 0.0;
26318d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d)
264ab213215SJeremy L Thompson     for (CeedInt i = 0; i < P1d; ++i)
26518d499f1SYohann       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
266241a4b83SYohann   __syncthreads();
267241a4b83SYohann }
268241a4b83SYohann 
269ab213215SJeremy L Thompson //------------------------------------------------------------------------------
270ab213215SJeremy L Thompson // 2D transpose tensor contract y
271ab213215SJeremy L Thompson //------------------------------------------------------------------------------
272241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
273ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
27418d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
275241a4b83SYohann   __syncthreads();
276241a4b83SYohann   *V = 0.0;
27718d499f1SYohann   if (data.tidx < Q1d && data.tidy < P1d)
278ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
27918d499f1SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
280241a4b83SYohann   __syncthreads();
281241a4b83SYohann }
282241a4b83SYohann 
283ab213215SJeremy L Thompson //------------------------------------------------------------------------------
284ab213215SJeremy L Thompson // 2D transpose tensor contract x
285ab213215SJeremy L Thompson //------------------------------------------------------------------------------
286241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
287ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
28818d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
289241a4b83SYohann   __syncthreads();
290241a4b83SYohann   *V = 0.0;
29118d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
292ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
29318d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
294241a4b83SYohann   __syncthreads();
295241a4b83SYohann }
296241a4b83SYohann 
297ab213215SJeremy L Thompson //------------------------------------------------------------------------------
298ab213215SJeremy L Thompson // 2D transpose tensor contract and add x
299ab213215SJeremy L Thompson //------------------------------------------------------------------------------
300241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
301ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
30218d499f1SYohann   data.slice[data.tidx+data.tidy*T1d] = *U;
303241a4b83SYohann   __syncthreads();
30418d499f1SYohann   if (data.tidx < P1d && data.tidy < P1d)
305ab213215SJeremy L Thompson     for (CeedInt i = 0; i < Q1d; ++i)
30618d499f1SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
307241a4b83SYohann   __syncthreads();
308241a4b83SYohann }
309241a4b83SYohann 
310ab213215SJeremy L Thompson //------------------------------------------------------------------------------
311ab213215SJeremy L Thompson // 2D interpolate to quadrature points
312ab213215SJeremy L Thompson //------------------------------------------------------------------------------
313241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
314ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315241a4b83SYohann   CeedScalar r_t[1];
316ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319241a4b83SYohann   }
320241a4b83SYohann }
321241a4b83SYohann 
322ab213215SJeremy L Thompson //------------------------------------------------------------------------------
323ab213215SJeremy L Thompson // 2D interpolate transpose
324ab213215SJeremy L Thompson //------------------------------------------------------------------------------
325241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
326ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327241a4b83SYohann   CeedScalar r_t[1];
328ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331241a4b83SYohann   }
332241a4b83SYohann }
333241a4b83SYohann 
334ab213215SJeremy L Thompson //------------------------------------------------------------------------------
335ab213215SJeremy L Thompson // 2D derivatives at quadrature points
336ab213215SJeremy L Thompson //------------------------------------------------------------------------------
337241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
338ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339241a4b83SYohann   CeedScalar r_t[1];
340ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343241a4b83SYohann     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344241a4b83SYohann     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345241a4b83SYohann   }
346241a4b83SYohann }
347241a4b83SYohann 
348ab213215SJeremy L Thompson //------------------------------------------------------------------------------
349ab213215SJeremy L Thompson // 2D derivatives transpose
350ab213215SJeremy L Thompson //------------------------------------------------------------------------------
351241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
352ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353241a4b83SYohann   CeedScalar r_t[1];
354ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356241a4b83SYohann     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357241a4b83SYohann     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358241a4b83SYohann     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359241a4b83SYohann   }
360241a4b83SYohann }
361241a4b83SYohann 
362ab213215SJeremy L Thompson //------------------------------------------------------------------------------
363241a4b83SYohann // 3D
364ab213215SJeremy L Thompson //------------------------------------------------------------------------------
365ab213215SJeremy L Thompson 
366ab213215SJeremy L Thompson //------------------------------------------------------------------------------
367ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided
368ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3695c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
370053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
371ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
372241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3734d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3765c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377241a4b83SYohann     }
378241a4b83SYohann }
379920dcdc4Sjeremylt 
380ab213215SJeremy L Thompson //------------------------------------------------------------------------------
381ab213215SJeremy L Thompson // L-vector -> E-vector, strided
382ab213215SJeremy L Thompson //------------------------------------------------------------------------------
383d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
385ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
386ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
387ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391ccedf6b0Sjeremylt     }
392ccedf6b0Sjeremylt }
393241a4b83SYohann 
394ab213215SJeremy L Thompson //------------------------------------------------------------------------------
395ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided
396ab213215SJeremy L Thompson //------------------------------------------------------------------------------
3975c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
398053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
39918d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
400ac421f39SYohann     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
401920dcdc4Sjeremylt     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
402ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4035c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
404ac421f39SYohann   }
40518d499f1SYohann }
406ac421f39SYohann 
407ab213215SJeremy L Thompson //------------------------------------------------------------------------------
408ab213215SJeremy L Thompson // E-vector -> Q-vector, strided
409ab213215SJeremy L Thompson //------------------------------------------------------------------------------
410d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
411053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
41218d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
413920dcdc4Sjeremylt     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
414d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
415ab213215SJeremy L Thompson     for (CeedInt comp = 0; comp < NCOMP; ++comp)
416d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
417920dcdc4Sjeremylt   }
41818d499f1SYohann }
419920dcdc4Sjeremylt 
420ab213215SJeremy L Thompson //------------------------------------------------------------------------------
421ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided
422ab213215SJeremy L Thompson //------------------------------------------------------------------------------
4235c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
424053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
425ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
426241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4274d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
428ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
429ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4305c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
431241a4b83SYohann     }
432241a4b83SYohann }
433241a4b83SYohann 
434ab213215SJeremy L Thompson //------------------------------------------------------------------------------
435ab213215SJeremy L Thompson // E-vector -> L-vector, strided
436ab213215SJeremy L Thompson //------------------------------------------------------------------------------
437d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4382f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
439ab213215SJeremy L Thompson   if (data.tidx < P1d && data.tidy < P1d)
440ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
441ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
442d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
443ab213215SJeremy L Thompson       for (CeedInt comp = 0; comp < NCOMP; ++comp)
444d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
445ccedf6b0Sjeremylt     }
446ccedf6b0Sjeremylt }
447ccedf6b0Sjeremylt 
448ab213215SJeremy L Thompson //------------------------------------------------------------------------------
449ab213215SJeremy L Thompson // 3D tensor contract x
450ab213215SJeremy L Thompson //------------------------------------------------------------------------------
451241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
452ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
453ac421f39SYohann   CeedScalar r_B[P1d];
454ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
455ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
456ab213215SJeremy L Thompson 
457ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
45818d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
459241a4b83SYohann     __syncthreads();
460241a4b83SYohann     V[k] = 0.0;
46118d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
462ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
46318d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
464241a4b83SYohann     __syncthreads();
465241a4b83SYohann   }
466241a4b83SYohann }
467241a4b83SYohann 
468ab213215SJeremy L Thompson //------------------------------------------------------------------------------
469ab213215SJeremy L Thompson // 3D tensor contract y
470ab213215SJeremy L Thompson //------------------------------------------------------------------------------
471241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
472ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
473ac421f39SYohann   CeedScalar r_B[P1d];
474ab213215SJeremy L Thompson   for (CeedInt i = 0; i < P1d; ++i)
475ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
476ab213215SJeremy L Thompson 
477ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
47818d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
479241a4b83SYohann     __syncthreads();
480241a4b83SYohann     V[k] = 0.0;
48118d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
482ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
48318d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
484241a4b83SYohann     __syncthreads();
485241a4b83SYohann   }
486241a4b83SYohann }
487241a4b83SYohann 
488ab213215SJeremy L Thompson //------------------------------------------------------------------------------
489ab213215SJeremy L Thompson // 3D tensor contract z
490ab213215SJeremy L Thompson //------------------------------------------------------------------------------
491241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
492ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
493ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
494241a4b83SYohann     V[k] = 0.0;
49518d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
496ab213215SJeremy L Thompson       for (CeedInt i = 0; i < P1d; ++i)
497ab213215SJeremy L Thompson         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
498241a4b83SYohann   }
499241a4b83SYohann }
500241a4b83SYohann 
501ab213215SJeremy L Thompson //------------------------------------------------------------------------------
502ab213215SJeremy L Thompson // 3D transpose tensor contract z
503ab213215SJeremy L Thompson //------------------------------------------------------------------------------
504241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
505ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
50618d499f1SYohann   for (CeedInt k = 0; k < P1d; ++k) {
507241a4b83SYohann     V[k] = 0.0;
50818d499f1SYohann     if (data.tidx < Q1d && data.tidy < Q1d)
509ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
510ab213215SJeremy L Thompson         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
511241a4b83SYohann   }
512241a4b83SYohann }
513241a4b83SYohann 
514ab213215SJeremy L Thompson //------------------------------------------------------------------------------
515ab213215SJeremy L Thompson // 3D transpose tensor contract y
516ab213215SJeremy L Thompson //------------------------------------------------------------------------------
517241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
518ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
519ac421f39SYohann   CeedScalar r_B[Q1d];
520ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
521ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
522ab213215SJeremy L Thompson 
523ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
52418d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
525241a4b83SYohann     __syncthreads();
526241a4b83SYohann     V[k] = 0.0;
52718d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
528ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
52918d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
530ac421f39SYohann     __syncthreads();
531ac421f39SYohann   }
532ac421f39SYohann }
533ac421f39SYohann 
534ab213215SJeremy L Thompson //------------------------------------------------------------------------------
535ab213215SJeremy L Thompson // 3D transpose tensor contract add y
536ab213215SJeremy L Thompson //------------------------------------------------------------------------------
537ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
538ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
539ac421f39SYohann   CeedScalar r_B[Q1d];
540ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
541ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
542ab213215SJeremy L Thompson 
543ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
54418d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
545ac421f39SYohann     __syncthreads();
54618d499f1SYohann     if (data.tidx < Q1d && data.tidy < P1d)
547ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
54818d499f1SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
549241a4b83SYohann     __syncthreads();
550241a4b83SYohann   }
551241a4b83SYohann }
552241a4b83SYohann 
553ab213215SJeremy L Thompson //------------------------------------------------------------------------------
554ab213215SJeremy L Thompson // 3D transpose tensor contract x
555ab213215SJeremy L Thompson //------------------------------------------------------------------------------
556241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
557ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
558ac421f39SYohann   CeedScalar r_B[Q1d];
559ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
560ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
561ab213215SJeremy L Thompson 
562ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
56318d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
564241a4b83SYohann     __syncthreads();
565241a4b83SYohann     V[k] = 0.0;
56618d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
567ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
56818d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
569241a4b83SYohann     __syncthreads();
570241a4b83SYohann   }
571241a4b83SYohann }
572241a4b83SYohann 
573ab213215SJeremy L Thompson //------------------------------------------------------------------------------
574ab213215SJeremy L Thompson // 3D transpose tensor contract add x
575ab213215SJeremy L Thompson //------------------------------------------------------------------------------
576241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
577ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
578ac421f39SYohann   CeedScalar r_B[Q1d];
579ab213215SJeremy L Thompson   for (CeedInt i = 0; i < Q1d; ++i)
580ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
581ab213215SJeremy L Thompson 
582ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
58318d499f1SYohann     data.slice[data.tidx+data.tidy*T1d] = U[k];
584241a4b83SYohann     __syncthreads();
58518d499f1SYohann     if (data.tidx < P1d && data.tidy < P1d)
586ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
58718d499f1SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
588241a4b83SYohann     __syncthreads();
589241a4b83SYohann   }
590241a4b83SYohann }
591241a4b83SYohann 
592ab213215SJeremy L Thompson //------------------------------------------------------------------------------
593ab213215SJeremy L Thompson // 3D interpolate to quadrature points
594ab213215SJeremy L Thompson //------------------------------------------------------------------------------
595241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
596ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
59718d499f1SYohann   CeedScalar r_t1[T1d];
59818d499f1SYohann   CeedScalar r_t2[T1d];
599ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
600241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
601241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
602241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
603241a4b83SYohann   }
604241a4b83SYohann }
605241a4b83SYohann 
606ab213215SJeremy L Thompson //------------------------------------------------------------------------------
607ab213215SJeremy L Thompson // 3D interpolate transpose
608ab213215SJeremy L Thompson //------------------------------------------------------------------------------
609241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
610ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
61118d499f1SYohann   CeedScalar r_t1[T1d];
61218d499f1SYohann   CeedScalar r_t2[T1d];
613ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
614241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
615241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
616241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
617241a4b83SYohann   }
618241a4b83SYohann }
619241a4b83SYohann 
620ab213215SJeremy L Thompson //------------------------------------------------------------------------------
621ab213215SJeremy L Thompson // 3D derivatives at quadrature points
622ab213215SJeremy L Thompson //------------------------------------------------------------------------------
623241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
624ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
62518d499f1SYohann   CeedScalar r_t1[T1d];
62618d499f1SYohann   CeedScalar r_t2[T1d];
627ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
628241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
629241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
630241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
631241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
632241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
633241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
634241a4b83SYohann     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
635241a4b83SYohann     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
636241a4b83SYohann     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
637241a4b83SYohann   }
638241a4b83SYohann }
639241a4b83SYohann 
640ab213215SJeremy L Thompson //------------------------------------------------------------------------------
641ab213215SJeremy L Thompson // 3D derivatives transpose
642ab213215SJeremy L Thompson //------------------------------------------------------------------------------
643241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
644ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
64518d499f1SYohann   CeedScalar r_t1[T1d];
64618d499f1SYohann   CeedScalar r_t2[T1d];
647ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; comp++) {
648241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
649241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
650241a4b83SYohann     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
651241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
652241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
653241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
654241a4b83SYohann     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
655241a4b83SYohann     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
656241a4b83SYohann     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
657241a4b83SYohann   }
658241a4b83SYohann }
659241a4b83SYohann 
660ab213215SJeremy L Thompson //------------------------------------------------------------------------------
661ab213215SJeremy L Thompson // 3D collocated derivatives computation
662ab213215SJeremy L Thompson //------------------------------------------------------------------------------
663ac421f39SYohann template <int NCOMP, int Q1d>
664ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
66518d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
666ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
66718d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
668ac421f39SYohann       __syncthreads();
669ac421f39SYohann       // X derivative
670ac421f39SYohann       r_V[comp+0*NCOMP] = 0.0;
671ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
67218d499f1SYohann         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
673ac421f39SYohann       // Y derivative
674ac421f39SYohann       r_V[comp+1*NCOMP] = 0.0;
675ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
67618d499f1SYohann         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
677ac421f39SYohann       // Z derivative
678ac421f39SYohann       r_V[comp+2*NCOMP] = 0.0;
679ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
680ab213215SJeremy L Thompson         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
681ac421f39SYohann       __syncthreads();
682ac421f39SYohann     }
683ac421f39SYohann   }
68418d499f1SYohann }
685ac421f39SYohann 
686ab213215SJeremy L Thompson //------------------------------------------------------------------------------
687ab213215SJeremy L Thompson // 3D collocated derivatives transpose
688ab213215SJeremy L Thompson //------------------------------------------------------------------------------
689ac421f39SYohann template <int NCOMP, int Q1d>
690ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
69118d499f1SYohann   if (data.tidx < Q1d && data.tidy < Q1d) {
692ac421f39SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
693ac421f39SYohann       // X derivative
69418d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
695ac421f39SYohann       __syncthreads();
696ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
69718d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
698ac421f39SYohann       __syncthreads();
699ac421f39SYohann       // Y derivative
70018d499f1SYohann       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
701ac421f39SYohann       __syncthreads();
702ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
70318d499f1SYohann         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
704ac421f39SYohann       __syncthreads();
705ac421f39SYohann       // Z derivative
706ab213215SJeremy L Thompson       for (CeedInt i = 0; i < Q1d; ++i)
707ac421f39SYohann         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
708ac421f39SYohann     }
709ac421f39SYohann   }
71018d499f1SYohann }
711ac421f39SYohann 
712ab213215SJeremy L Thompson //------------------------------------------------------------------------------
713ab213215SJeremy L Thompson // 1D quadrature weights
714ab213215SJeremy L Thompson //------------------------------------------------------------------------------
715241a4b83SYohann template <int Q1d>
716053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
71718d499f1SYohann   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
718241a4b83SYohann }
719241a4b83SYohann 
720ab213215SJeremy L Thompson //------------------------------------------------------------------------------
721ab213215SJeremy L Thompson // 2D quadrature weights
722ab213215SJeremy L Thompson //------------------------------------------------------------------------------
723241a4b83SYohann template <int Q1d>
724053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
72518d499f1SYohann   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
72618d499f1SYohann         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
727241a4b83SYohann }
728241a4b83SYohann 
729ab213215SJeremy L Thompson //------------------------------------------------------------------------------
730ab213215SJeremy L Thompson // 3D quadrature weights
731ab213215SJeremy L Thompson //------------------------------------------------------------------------------
732241a4b83SYohann template <int Q1d>
733053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
73418d499f1SYohann   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
73518d499f1SYohann   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
73718d499f1SYohann     w[z] = quad ? pw*qweight1d[z] : 0.0;
738241a4b83SYohann }
739241a4b83SYohann 
740241a4b83SYohann );
741ab213215SJeremy L Thompson //------------------------------------------------------------------------------
742ab213215SJeremy L Thompson // Build singe operator kernel
743ab213215SJeremy L Thompson //------------------------------------------------------------------------------
744241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
745241a4b83SYohann 
746241a4b83SYohann   using std::ostringstream;
747241a4b83SYohann   using std::string;
748241a4b83SYohann   int ierr;
749241a4b83SYohann   bool setupdone;
750e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
751e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
752241a4b83SYohann   Ceed ceed;
753e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
754241a4b83SYohann   CeedOperator_Cuda_gen *data;
755e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
756241a4b83SYohann   CeedQFunction qf;
757241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
758e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
759e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
760e79b91d9SJeremy L Thompson   CeedSize lsize;
76188db6d3bSJeremy L Thompson   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
762*1d47fde2SJeremy L Thompson           numoutputfields, ncomp, dim = 1;
763e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
764*1d47fde2SJeremy L Thompson   Q1d = Q;
765e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
766241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
7677e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
768e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
769241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
7707e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
771e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
772241a4b83SYohann   CeedEvalMode emode;
773241a4b83SYohann   CeedBasis basis;
774241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
775241a4b83SYohann   CeedElemRestriction Erestrict;
776461525f5SNatalie Beams   CeedElemRestriction_Cuda *restr_data;
777241a4b83SYohann 
7780b454692Sjeremylt   // Check for restriction only identity operator
7790b454692Sjeremylt   bool is_identity_qf;
7800b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7810b454692Sjeremylt   if (is_identity_qf) {
7820b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7830b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
7840b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
7850b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
7860b454692Sjeremylt       // LCOV_EXCL_START
7870b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
7880b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
7890b454692Sjeremylt     // LCOV_EXCL_STOP
7900b454692Sjeremylt   }
7910b454692Sjeremylt 
792241a4b83SYohann   ostringstream code;
793241a4b83SYohann   string devFunctions(deviceFunctions);
794241a4b83SYohann 
795f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
796f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
797f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
79888db6d3bSJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
7990d0321e0SJeremy L Thompson   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
80080a9ef05SNatalie Beams   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
801f1a13f77SYohann Dudouit     code << atomicAdd;
802f1a13f77SYohann Dudouit   }
803f1a13f77SYohann Dudouit 
804241a4b83SYohann   code << devFunctions;
805241a4b83SYohann 
806241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
807c3c443faSYohann Dudouit   string qFunctionName(qf_data->qFunctionName);
808c3c443faSYohann Dudouit   string oper;
80914cce66cSYohann Dudouit   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
8104d537eeaSYohann 
8114d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81277841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
813e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
814e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8151da99368SJeremy L Thompson 
8161da99368SJeremy L Thompson   // Find dim and Q1d
81764d3f0c0Sjeremylt   bool useCollograd = true;
81818d499f1SYohann   data->maxP1d = 0;
8191da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
820e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8211da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
822e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8230f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
824e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8251da99368SJeremy L Thompson 
8261da99368SJeremy L Thompson       // Check for collocated gradient
827437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8281da99368SJeremy L Thompson 
8291da99368SJeremy L Thompson       // Collect dim and Q1d
830e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8311da99368SJeremy L Thompson       bool isTensor;
832e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8331da99368SJeremy L Thompson       if (isTensor) {
834e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
835e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
83618d499f1SYohann         if (P1d>data->maxP1d) data->maxP1d = P1d;
8371da99368SJeremy L Thompson       } else {
838e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
839e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
840e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8411da99368SJeremy L Thompson         }
8421da99368SJeremy L Thompson     }
8431da99368SJeremy L Thompson   }
8441da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
8451da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
8461da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
847e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8480f54b25eSjeremylt 
8491da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
850e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8510f54b25eSjeremylt       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
852e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8530f54b25eSjeremylt 
8541da99368SJeremy L Thompson       // Collect dim and Q1d
855e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8561da99368SJeremy L Thompson       bool isTensor;
857e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8581da99368SJeremy L Thompson       if (isTensor) {
859e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8601da99368SJeremy L Thompson       } else {
861e9f4dca0SJeremy L Thompson         // LCOV_EXCL_START
862e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
863e9f4dca0SJeremy L Thompson         // LCOV_EXCL_STOP
8641da99368SJeremy L Thompson         }
8650f54b25eSjeremylt 
8660f54b25eSjeremylt       // Check for collocated gradient
867437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8681da99368SJeremy L Thompson     }
8691da99368SJeremy L Thompson   }
8701da99368SJeremy L Thompson   data->dim = dim;
8711da99368SJeremy L Thompson   data->Q1d = Q1d;
8721da99368SJeremy L Thompson 
8731da99368SJeremy L Thompson   // Define CEED_Q_VLA
87464d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
8751da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
8761da99368SJeremy L Thompson   } else {
8771da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8781da99368SJeremy L Thompson   }
8791da99368SJeremy L Thompson 
880241a4b83SYohann   code << qFunction;
881241a4b83SYohann 
882241a4b83SYohann   // Setup
883818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
884c3c443faSYohann Dudouit   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
885241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
886241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
887e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8881da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
889241a4b83SYohann       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
890241a4b83SYohann     }
891241a4b83SYohann   }
892241a4b83SYohann 
893241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
894241a4b83SYohann     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
895241a4b83SYohann   }
8961da99368SJeremy L Thompson 
897241a4b83SYohann   code << "  const CeedInt Dim = "<<dim<<";\n";
898241a4b83SYohann   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8991da99368SJeremy L Thompson 
900241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
901241a4b83SYohann   code << "  BackendData data;\n";
902241a4b83SYohann   code << "  data.tidx = threadIdx.x;\n";
903241a4b83SYohann   code << "  data.tidy = threadIdx.y;\n";
904241a4b83SYohann   code << "  data.tidz = threadIdx.z;\n";
905241a4b83SYohann   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
90618d499f1SYohann   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
907920dcdc4Sjeremylt 
908818e0025SJeremy L Thompson   code << "\n  // -- Input field constants and basis data --\n";
909ac421f39SYohann   //Initialize constants, and matrices B and G
910241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
911818e0025SJeremy L Thompson     code << "  // ---- Input field "<<i<<" ----\n";
912241a4b83SYohann     // Get elemsize, emode, ncomp
913241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
914e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
915241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
916e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
917241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
918e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9194d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
920e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
921920dcdc4Sjeremylt 
922920dcdc4Sjeremylt     // Set field constants
923920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
924e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
925920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
926e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
927920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
928920dcdc4Sjeremylt       } else {
929920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
930920dcdc4Sjeremylt       }
931920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
932920dcdc4Sjeremylt     }
933920dcdc4Sjeremylt 
934920dcdc4Sjeremylt     // Load basis data
935920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
936241a4b83SYohann     switch (emode) {
937241a4b83SYohann     case CEED_EVAL_NONE:
938241a4b83SYohann       break;
939241a4b83SYohann     case CEED_EVAL_INTERP:
940e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
941437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94280a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
943241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
944241a4b83SYohann       break;
945241a4b83SYohann     case CEED_EVAL_GRAD:
946e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
947437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94880a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
95064d3f0c0Sjeremylt       if (useCollograd) {
951437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
95280a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
953ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
954ac421f39SYohann       } else {
955437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
95680a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
957241a4b83SYohann         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
958ac421f39SYohann       }
959241a4b83SYohann       break;
960241a4b83SYohann     case CEED_EVAL_WEIGHT:
961241a4b83SYohann       break; // No action
962241a4b83SYohann     case CEED_EVAL_DIV:
963241a4b83SYohann       break; // TODO: Not implemented
964241a4b83SYohann     case CEED_EVAL_CURL:
965241a4b83SYohann       break; // TODO: Not implemented
966241a4b83SYohann     }
967241a4b83SYohann   }
968920dcdc4Sjeremylt 
969818e0025SJeremy L Thompson   code << "\n  // -- Output field constants and basis data --\n";
970241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
971818e0025SJeremy L Thompson     code << "  // ---- Output field "<<i<<" ----\n";
972241a4b83SYohann     // Get elemsize, emode, ncomp
973241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
974e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
975241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
976e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
977241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
978e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9794d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
980e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
981920dcdc4Sjeremylt 
982920dcdc4Sjeremylt     // Set field constants
983e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
984920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
985e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
986241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
987920dcdc4Sjeremylt     } else {
988920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
989920dcdc4Sjeremylt     }
990920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
991920dcdc4Sjeremylt 
992920dcdc4Sjeremylt     // Load basis data
993920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
994920dcdc4Sjeremylt     switch (emode) {
995920dcdc4Sjeremylt     case CEED_EVAL_NONE:
996920dcdc4Sjeremylt       break; // No action
997920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
998e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
999437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100080a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1001241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1002241a4b83SYohann       break;
1003241a4b83SYohann     case CEED_EVAL_GRAD:
1004e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1005437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100680a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
100864d3f0c0Sjeremylt       if (useCollograd) {
1009437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
101080a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1011ac421f39SYohann         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1012ac421f39SYohann       } else {
1013437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
101480a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1015241a4b83SYohann         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1016ac421f39SYohann       }
1017241a4b83SYohann       break;
1018e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1019241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1020241a4b83SYohann       Ceed ceed;
1021e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1022e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1023241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1024241a4b83SYohann       break; // Should not occur
1025241a4b83SYohann     }
1026241a4b83SYohann     case CEED_EVAL_DIV:
1027241a4b83SYohann       break; // TODO: Not implemented
1028241a4b83SYohann     case CEED_EVAL_CURL:
1029241a4b83SYohann       break; // TODO: Not implemented
1030e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1031241a4b83SYohann     }
1032241a4b83SYohann   }
1033818e0025SJeremy L Thompson   code << "\n  // -- Element loop --\n";
1034ac421f39SYohann   code << "  __syncthreads();\n";
1035241a4b83SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1036241a4b83SYohann   // Input basis apply if needed
1037ac421f39SYohann   // Generate the correct eval mode code for each input
1038818e0025SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions --\n";
1039241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1040818e0025SJeremy L Thompson     code << "    // ---- Input field "<<i<<" ----\n";
1041241a4b83SYohann     // Get elemsize, emode, ncomp
1042241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1043e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1044241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1045e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1046241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1047e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10484d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1049e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1050920dcdc4Sjeremylt 
1051920dcdc4Sjeremylt     // Restriction
1052920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
105364d3f0c0Sjeremylt         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1054241a4b83SYohann       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10559a2291e3SJeremy L Thompson 
10569a2291e3SJeremy L Thompson       bool isStrided;
1057e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10589a2291e3SJeremy L Thompson       if (!isStrided) {
10595c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1060e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10615c7b696cSjeremylt         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10625c7b696cSjeremylt         CeedInt compstride;
1063e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10645c7b696cSjeremylt         code << "    // CompStride: "<<compstride<<"\n";
1065e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10669a2291e3SJeremy L Thompson         data->indices.in[i] = restr_data->d_ind;
10675c7b696cSjeremylt         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1068ccedf6b0Sjeremylt       } else {
1069920dcdc4Sjeremylt         bool backendstrides;
1070fd364f38SJeremy L Thompson         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1071e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
107213585805SJeremy L Thompson         CeedInt nelem;
107313585805SJeremy L Thompson         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1074e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
107513585805SJeremy L Thompson         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1076920dcdc4Sjeremylt         if (!backendstrides) {
1077920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1078e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
1079ccedf6b0Sjeremylt         }
1080920dcdc4Sjeremylt         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1081d80fc06aSjeremylt         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1082920dcdc4Sjeremylt       }
1083920dcdc4Sjeremylt     }
1084920dcdc4Sjeremylt 
1085920dcdc4Sjeremylt     // Basis action
1086920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1087920dcdc4Sjeremylt     switch (emode) {
1088920dcdc4Sjeremylt     case CEED_EVAL_NONE:
108964d3f0c0Sjeremylt       if (!useCollograd) {
1090920dcdc4Sjeremylt         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1091920dcdc4Sjeremylt       }
1092920dcdc4Sjeremylt       break;
1093920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
1094241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1095241a4b83SYohann       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1096241a4b83SYohann       break;
1097241a4b83SYohann     case CEED_EVAL_GRAD:
109864d3f0c0Sjeremylt       if (useCollograd) {
1099ac421f39SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100ac421f39SYohann         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101ac421f39SYohann       } else {
1102241a4b83SYohann         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1103241a4b83SYohann         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1104ac421f39SYohann       }
1105241a4b83SYohann       break;
1106241a4b83SYohann     case CEED_EVAL_WEIGHT:
1107241a4b83SYohann       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1108e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1109e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1110437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
1111241a4b83SYohann       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1112241a4b83SYohann       break; // No action
1113241a4b83SYohann     case CEED_EVAL_DIV:
1114241a4b83SYohann       break; // TODO: Not implemented
1115241a4b83SYohann     case CEED_EVAL_CURL:
1116241a4b83SYohann       break; // TODO: Not implemented
1117241a4b83SYohann     }
1118241a4b83SYohann   }
1119ac421f39SYohann 
1120241a4b83SYohann   // Q function
1121818e0025SJeremy L Thompson   code << "\n    // -- Output field setup --\n";
1122241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1123818e0025SJeremy L Thompson       code << "\n    // ---- Output field "<<i<<" ----\n";
1124241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1125e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1126241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1127241a4b83SYohann     {
112864d3f0c0Sjeremylt       if (useCollograd) {
1129ac421f39SYohann         //Accumulator for gradient slices
1130ac421f39SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131ac421f39SYohann         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1132ac421f39SYohann         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1133ac421f39SYohann         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1134ac421f39SYohann         code << "      }\n";
1135ac421f39SYohann         code << "    }\n";
1136ac421f39SYohann       } else {
1137241a4b83SYohann         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1138241a4b83SYohann       }
1139ac421f39SYohann     }
1140241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1141241a4b83SYohann     {
1142241a4b83SYohann       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1143241a4b83SYohann     }
1144241a4b83SYohann   }
1145ac421f39SYohann   // We treat quadrature points per slice in 3d to save registers
114664d3f0c0Sjeremylt   if (useCollograd) {
1147920dcdc4Sjeremylt     code << "\n    // Note: Collocated Gradient\n";
1148ac421f39SYohann     code << "#pragma unroll\n";
1149ac421f39SYohann     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1150818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1151ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1152818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1153ac421f39SYohann       // Get elemsize, emode, ncomp
1154ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1155e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1156ac421f39SYohann       // Basis action
1157920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1158ac421f39SYohann       switch (emode) {
1159ac421f39SYohann       case CEED_EVAL_NONE:
1160ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11619a2291e3SJeremy L Thompson 
11629a2291e3SJeremy L Thompson         bool isStrided;
1163e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1164e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1165e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11669a2291e3SJeremy L Thompson         if (!isStrided) {
11675c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1168e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11695c7b696cSjeremylt           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11705c7b696cSjeremylt           CeedInt compstride;
1171e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11725c7b696cSjeremylt           code << "      // CompStride: "<<compstride<<"\n";
1173e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11749a2291e3SJeremy L Thompson           data->indices.in[i] = restr_data->d_ind;
11755c7b696cSjeremylt           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1176920dcdc4Sjeremylt         } else {
1177920dcdc4Sjeremylt           bool backendstrides;
1178fd364f38SJeremy L Thompson           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1179e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
118013585805SJeremy L Thompson           CeedInt nelem;
118113585805SJeremy L Thompson           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1182e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
118313585805SJeremy L Thompson           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1184920dcdc4Sjeremylt           if (!backendstrides) {
1185920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1186e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
1187920dcdc4Sjeremylt           }
1188920dcdc4Sjeremylt           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1189d80fc06aSjeremylt           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1190920dcdc4Sjeremylt         }
1191ac421f39SYohann         break;
1192ac421f39SYohann       case CEED_EVAL_INTERP:
1193ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1194ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1195ac421f39SYohann         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1196ac421f39SYohann         code << "      }\n";
1197ac421f39SYohann         break;
1198ac421f39SYohann       case CEED_EVAL_GRAD:
1199ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1200ac421f39SYohann         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1201ac421f39SYohann         break;
1202ac421f39SYohann       case CEED_EVAL_WEIGHT:
1203ac421f39SYohann         code << "      CeedScalar r_q"<<i<<"[1];\n";
1204ac421f39SYohann         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1205ac421f39SYohann         break; // No action
1206ac421f39SYohann       case CEED_EVAL_DIV:
1207ac421f39SYohann         break; // TODO: Not implemented
1208ac421f39SYohann       case CEED_EVAL_CURL:
1209ac421f39SYohann         break; // TODO: Not implemented
1210ac421f39SYohann       }
1211ac421f39SYohann     }
1212818e0025SJeremy L Thompson     code << "\n      // -- Output fields --\n";
1213ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1214818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1215ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1216e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1217ac421f39SYohann       // Basis action
1218ac421f39SYohann       switch (emode) {
1219ac421f39SYohann       case CEED_EVAL_NONE:
1220ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1221ac421f39SYohann         break; // No action
1222ac421f39SYohann       case CEED_EVAL_INTERP:
1223ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1224ac421f39SYohann         break;
1225ac421f39SYohann       case CEED_EVAL_GRAD:
1226ac421f39SYohann         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1227ac421f39SYohann         break;
1228ac421f39SYohann       case CEED_EVAL_WEIGHT:
1229ac421f39SYohann         break; // Should not occur
1230ac421f39SYohann       case CEED_EVAL_DIV:
1231ac421f39SYohann         break; // TODO: Not implemented
1232ac421f39SYohann       case CEED_EVAL_CURL:
1233ac421f39SYohann         break; // TODO: Not implemented
1234ac421f39SYohann       }
1235ac421f39SYohann     }
1236ac421f39SYohann   } else {
1237920dcdc4Sjeremylt     code << "\n      // Note: No Collocated Gradient\n";
1238818e0025SJeremy L Thompson     code << "      // -- Input fields --\n";
1239ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1240818e0025SJeremy L Thompson       code << "      // ---- Input field "<<i<<" ----\n";
1241ac421f39SYohann       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1242ac421f39SYohann     }
1243818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1244ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1245818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1246ac421f39SYohann       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1247ac421f39SYohann     }
1248ac421f39SYohann   }
1249818e0025SJeremy L Thompson   code << "\n      // -- QFunction Inputs and outputs --\n";
12504d537eeaSYohann   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12514d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1252818e0025SJeremy L Thompson     code << "      // ---- Input field "<<i<<" ----\n";
1253ac421f39SYohann     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12544d537eeaSYohann   }
12554d537eeaSYohann   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12564d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1257818e0025SJeremy L Thompson     code << "      // ---- Output field "<<i<<" ----\n";
1258ac421f39SYohann     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12594d537eeaSYohann   }
1260818e0025SJeremy L Thompson   code << "\n      // -- Apply QFunction --\n";
1261ac421f39SYohann   code << "      "<<qFunctionName<<"(ctx, ";
126264d3f0c0Sjeremylt   if (dim != 3 || useCollograd) {
1263ac421f39SYohann     code << "1";
1264ac421f39SYohann   } else {
1265ac421f39SYohann     code << "Q1d";
1266ac421f39SYohann   }
1267ac421f39SYohann   code << ", in, out);\n";
126864d3f0c0Sjeremylt   if (useCollograd) {
1269920dcdc4Sjeremylt     code << "\n      // Note: Collocated Gradient\n";
1270818e0025SJeremy L Thompson     code << "      // -- Output fields --\n";
1271ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1272818e0025SJeremy L Thompson       code << "      // ---- Output field "<<i<<" ----\n";
1273ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1274e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
1275ac421f39SYohann       // Basis action
1276920dcdc4Sjeremylt       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1277ac421f39SYohann       switch (emode) {
1278ac421f39SYohann       case CEED_EVAL_NONE:
1279ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1280ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1281ac421f39SYohann         code << "      }\n";
1282ac421f39SYohann         break; // No action
1283ac421f39SYohann       case CEED_EVAL_INTERP:
1284ac421f39SYohann         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1285ac421f39SYohann         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1286ac421f39SYohann         code << "      }\n";
1287ac421f39SYohann         break;
1288ac421f39SYohann       case CEED_EVAL_GRAD:
1289ac421f39SYohann         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1290ac421f39SYohann         break;
1291ac421f39SYohann       case CEED_EVAL_WEIGHT:
1292ac421f39SYohann         break; // Should not occur
1293ac421f39SYohann       case CEED_EVAL_DIV:
1294ac421f39SYohann         break; // TODO: Not implemented
1295ac421f39SYohann       case CEED_EVAL_CURL:
1296ac421f39SYohann         break; // TODO: Not implemented
1297ac421f39SYohann       }
1298ac421f39SYohann     }
1299ac421f39SYohann     code << "    }\n";
1300ac421f39SYohann   }
1301241a4b83SYohann 
1302241a4b83SYohann   // Output basis apply if needed
1303ac421f39SYohann   // Generate the correct eval mode code for each output
1304818e0025SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions --\n";
1305241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1306818e0025SJeremy L Thompson     code << "    // ---- Output field "<<i<<" ----\n";
1307241a4b83SYohann     // Get elemsize, emode, ncomp
1308241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1309e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1310241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1311e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1312241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1313e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13144d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1315e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
1316241a4b83SYohann     // Basis action
1317920dcdc4Sjeremylt     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1318241a4b83SYohann     switch (emode) {
1319241a4b83SYohann     case CEED_EVAL_NONE:
1320920dcdc4Sjeremylt       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1321241a4b83SYohann       break; // No action
1322241a4b83SYohann     case CEED_EVAL_INTERP:
1323241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1324241a4b83SYohann       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1325241a4b83SYohann       break;
1326241a4b83SYohann     case CEED_EVAL_GRAD:
1327241a4b83SYohann       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
132864d3f0c0Sjeremylt       if (useCollograd) {
1329ac421f39SYohann         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1330ac421f39SYohann       } else {
1331241a4b83SYohann         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1332ac421f39SYohann       }
1333241a4b83SYohann       break;
1334e9f4dca0SJeremy L Thompson     // LCOV_EXCL_START
1335241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1336241a4b83SYohann       Ceed ceed;
1337e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1338e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
1339241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1340241a4b83SYohann       break; // Should not occur
1341241a4b83SYohann     }
1342241a4b83SYohann     case CEED_EVAL_DIV:
1343241a4b83SYohann       break; // TODO: Not implemented
1344241a4b83SYohann     case CEED_EVAL_CURL:
1345241a4b83SYohann       break; // TODO: Not implemented
1346e9f4dca0SJeremy L Thompson       // LCOV_EXCL_STOP
1347241a4b83SYohann     }
1348920dcdc4Sjeremylt     // Restriction
13499a2291e3SJeremy L Thompson       bool isStrided;
1350e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13519a2291e3SJeremy L Thompson     if (!isStrided) {
13525c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1353e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13545c7b696cSjeremylt       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13555c7b696cSjeremylt       CeedInt compstride;
1356e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13575c7b696cSjeremylt       code << "    // CompStride: "<<compstride<<"\n";
1358e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13599a2291e3SJeremy L Thompson       data->indices.out[i] = restr_data->d_ind;
13605c7b696cSjeremylt       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1361920dcdc4Sjeremylt     } else {
1362920dcdc4Sjeremylt       bool backendstrides;
1363fd364f38SJeremy L Thompson       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1364e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
136513585805SJeremy L Thompson       CeedInt nelem;
136613585805SJeremy L Thompson       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1367e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
136813585805SJeremy L Thompson       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1369920dcdc4Sjeremylt       if (!backendstrides) {
1370920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1371e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
1372920dcdc4Sjeremylt       }
1373920dcdc4Sjeremylt       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1374d80fc06aSjeremylt       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1375920dcdc4Sjeremylt     }
1376241a4b83SYohann   }
1377241a4b83SYohann 
1378241a4b83SYohann   code << "  }\n";
1379818e0025SJeremy L Thompson   code << "}\n";
1380818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
1381241a4b83SYohann 
1382ab213215SJeremy L Thompson   // View kernel for debugging
138346dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13843f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
1385241a4b83SYohann 
138618d499f1SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
138718d499f1SYohann                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1388e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1389c3c443faSYohann Dudouit   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1390e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
1391241a4b83SYohann 
1392e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1393e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
1394241a4b83SYohann }
1395ab213215SJeremy L Thompson //------------------------------------------------------------------------------
1396