xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 5c7b696c8582f667cb0fcf7d02b9ef3156803fee)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16241a4b83SYohann #include <ceed-backend.h>
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda/ceed-cuda.h"
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
26241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
27241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
28241a4b83SYohann   do {
29241a4b83SYohann     assumed = old;
30241a4b83SYohann     old =
31241a4b83SYohann       atomicCAS(address_as_ull, assumed,
32241a4b83SYohann                 __double_as_longlong(val +
33241a4b83SYohann                                      __longlong_as_double(assumed)));
34241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
35241a4b83SYohann     // (since NaN != NaN)
36241a4b83SYohann   } while (assumed != old);
37241a4b83SYohann   return __longlong_as_double(old);
38241a4b83SYohann }
39f1a13f77SYohann Dudouit );
40f1a13f77SYohann Dudouit 
41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
42f1a13f77SYohann Dudouit 
43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45f1a13f77SYohann Dudouit 
46f1a13f77SYohann Dudouit typedef struct {
47f1a13f77SYohann Dudouit   CeedInt tidx;
48f1a13f77SYohann Dudouit   CeedInt tidy;
49f1a13f77SYohann Dudouit   CeedInt tidz;
50f1a13f77SYohann Dudouit   CeedInt tid;
51f1a13f77SYohann Dudouit   CeedScalar* slice;
52f1a13f77SYohann Dudouit } BackendData;
53241a4b83SYohann 
54241a4b83SYohann template <int P, int Q>
55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56ac421f39SYohann   for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57241a4b83SYohann     B[i] = d_B[i];
58241a4b83SYohann   }
59241a4b83SYohann }
60241a4b83SYohann 
61241a4b83SYohann //****
62241a4b83SYohann // 1D
63*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
64*5c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
65241a4b83SYohann   if (data.tidx<P1d)
66241a4b83SYohann   {
674d537eeaSYohann     const CeedInt node = data.tidx;
68ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
69241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
70*5c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
71241a4b83SYohann     }
72241a4b83SYohann   }
73241a4b83SYohann }
74920dcdc4Sjeremylt 
75d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
76d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
77ccedf6b0Sjeremylt   if (data.tidx<P1d)
78ccedf6b0Sjeremylt   {
79ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
80d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
81ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
82d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
83ccedf6b0Sjeremylt     }
84ccedf6b0Sjeremylt   }
85ccedf6b0Sjeremylt }
86241a4b83SYohann 
87*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
88*5c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
89241a4b83SYohann   if (data.tidx<P1d)
90241a4b83SYohann   {
914d537eeaSYohann     const CeedInt node = data.tidx;
92ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
93241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
94*5c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
95241a4b83SYohann     }
96241a4b83SYohann   }
97241a4b83SYohann }
98241a4b83SYohann 
99d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
100d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
101ccedf6b0Sjeremylt   if (data.tidx<P1d)
102ccedf6b0Sjeremylt   {
103ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
104d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
105ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
106d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
107ccedf6b0Sjeremylt     }
108ccedf6b0Sjeremylt   }
109ccedf6b0Sjeremylt }
110ccedf6b0Sjeremylt 
111241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
112241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
113241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
114241a4b83SYohann   data.slice[data.tidx] = *U;
115241a4b83SYohann   __syncthreads();
116241a4b83SYohann   *V = 0.0;
117ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
118241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
119241a4b83SYohann   }
120241a4b83SYohann   __syncthreads();
121241a4b83SYohann }
122241a4b83SYohann 
123241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
124241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
125241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
126241a4b83SYohann   data.slice[data.tidx] = *U;
127241a4b83SYohann   __syncthreads();
128241a4b83SYohann   *V = 0.0;
129ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
130241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
131241a4b83SYohann   }
132241a4b83SYohann   __syncthreads();
133241a4b83SYohann }
134241a4b83SYohann 
135241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
136241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
137241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
138ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
139241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
140241a4b83SYohann   }
141241a4b83SYohann }
142241a4b83SYohann 
143241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
144241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
145241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
146ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
147241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
148241a4b83SYohann   }
149241a4b83SYohann }
150241a4b83SYohann 
151241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
152241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
153241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
154241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
155ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
156241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
157241a4b83SYohann   }
158241a4b83SYohann }
159241a4b83SYohann 
160241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
161241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
162241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
163241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
164ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
165241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
166241a4b83SYohann   }
167241a4b83SYohann }
168241a4b83SYohann 
169241a4b83SYohann //****
170241a4b83SYohann // 2D
171*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
172*5c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
173241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
174241a4b83SYohann   {
1754d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
176ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
177241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
178*5c7b696cSjeremylt       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
179241a4b83SYohann     }
180241a4b83SYohann   }
181241a4b83SYohann }
182920dcdc4Sjeremylt 
183d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
184d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
185ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
186ccedf6b0Sjeremylt   {
187ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
188d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
189ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
190d80fc06aSjeremylt       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
191ccedf6b0Sjeremylt     }
192ccedf6b0Sjeremylt   }
193ccedf6b0Sjeremylt }
194241a4b83SYohann 
195*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
196*5c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
197241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
198241a4b83SYohann   {
1994d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
200ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
201241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
202*5c7b696cSjeremylt       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
203241a4b83SYohann     }
204241a4b83SYohann   }
205241a4b83SYohann }
206241a4b83SYohann 
207d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
208d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
209ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
210ccedf6b0Sjeremylt   {
211ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
212d80fc06aSjeremylt     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
213ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
214d80fc06aSjeremylt       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
215ccedf6b0Sjeremylt     }
216ccedf6b0Sjeremylt   }
217ccedf6b0Sjeremylt }
218ccedf6b0Sjeremylt 
219241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
220241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
221241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
222241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
223241a4b83SYohann   __syncthreads();
224241a4b83SYohann   *V = 0.0;
225ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
226241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
227241a4b83SYohann   }
228241a4b83SYohann   __syncthreads();
229241a4b83SYohann }
230241a4b83SYohann 
231241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
232241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
233241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
234241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
235241a4b83SYohann   __syncthreads();
236241a4b83SYohann   *V = 0.0;
237ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
238241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
239241a4b83SYohann   }
240241a4b83SYohann   __syncthreads();
241241a4b83SYohann }
242241a4b83SYohann 
243241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
244241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
245241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
246241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
247241a4b83SYohann   __syncthreads();
248241a4b83SYohann   *V = 0.0;
249241a4b83SYohann   if (data.tidy<P1d) {
250ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
251241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
252241a4b83SYohann     }
253241a4b83SYohann   }
254241a4b83SYohann   __syncthreads();
255241a4b83SYohann }
256241a4b83SYohann 
257241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
258241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
259241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
260241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
261241a4b83SYohann   __syncthreads();
262241a4b83SYohann   *V = 0.0;
263241a4b83SYohann   if (data.tidx<P1d) {
264ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
265241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
266241a4b83SYohann     }
267241a4b83SYohann   }
268241a4b83SYohann   __syncthreads();
269241a4b83SYohann }
270241a4b83SYohann 
271241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
272241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
273241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
275241a4b83SYohann   __syncthreads();
276241a4b83SYohann   if (data.tidx<P1d) {
277ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
278241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
279241a4b83SYohann     }
280241a4b83SYohann   }
281241a4b83SYohann   __syncthreads();
282241a4b83SYohann }
283241a4b83SYohann 
284241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
285241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
286241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
287241a4b83SYohann   CeedScalar r_t[1];
288ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
289241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
290241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
291241a4b83SYohann   }
292241a4b83SYohann }
293241a4b83SYohann 
294241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
295241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
296241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
297241a4b83SYohann   CeedScalar r_t[1];
298ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
299241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
300241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
301241a4b83SYohann   }
302241a4b83SYohann }
303241a4b83SYohann 
304241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
305241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
306241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
307241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
308241a4b83SYohann   CeedScalar r_t[1];
309ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
310241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
311241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
312241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
313241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
314241a4b83SYohann   }
315241a4b83SYohann }
316241a4b83SYohann 
317241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
318241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
319241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
320241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
321241a4b83SYohann   CeedScalar r_t[1];
322ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
323241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
324241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
325241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
326241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
327241a4b83SYohann   }
328241a4b83SYohann }
329241a4b83SYohann 
330241a4b83SYohann //****
331241a4b83SYohann // 3D
332*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
333*5c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
334241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
335241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3364d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
337ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
338241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
339*5c7b696cSjeremylt         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
340241a4b83SYohann       }
341241a4b83SYohann     }
342241a4b83SYohann   }
343241a4b83SYohann }
344920dcdc4Sjeremylt 
345d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
346d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
347ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
348ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
349ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
350d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
351ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
352d80fc06aSjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
353ccedf6b0Sjeremylt       }
354ccedf6b0Sjeremylt     }
355ccedf6b0Sjeremylt   }
356ccedf6b0Sjeremylt }
357241a4b83SYohann 
358*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d>
359*5c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
360ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
361920dcdc4Sjeremylt   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
362ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
363*5c7b696cSjeremylt     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
364ac421f39SYohann   }
365ac421f39SYohann }
366ac421f39SYohann 
367d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
36825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
369920dcdc4Sjeremylt   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
370d80fc06aSjeremylt   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
371920dcdc4Sjeremylt   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
372d80fc06aSjeremylt     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
373920dcdc4Sjeremylt   }
374920dcdc4Sjeremylt }
375920dcdc4Sjeremylt 
376*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d>
377*5c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
378241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
379241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
3804d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
381ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
382241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
383*5c7b696cSjeremylt         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
384241a4b83SYohann       }
385241a4b83SYohann     }
386241a4b83SYohann   }
387241a4b83SYohann }
388241a4b83SYohann 
389d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
3902f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
391ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
392ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
393ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
394d80fc06aSjeremylt       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
395ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
396d80fc06aSjeremylt         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
397ccedf6b0Sjeremylt       }
398ccedf6b0Sjeremylt     }
399ccedf6b0Sjeremylt   }
400ccedf6b0Sjeremylt }
401ccedf6b0Sjeremylt 
402241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
403241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
404241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
405ac421f39SYohann   CeedScalar r_B[P1d];
406ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
407ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
408ac421f39SYohann   }
409ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
410241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
411241a4b83SYohann     __syncthreads();
412241a4b83SYohann     V[k] = 0.0;
413ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
414ac421f39SYohann       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
415241a4b83SYohann     }
416241a4b83SYohann     __syncthreads();
417241a4b83SYohann   }
418241a4b83SYohann }
419241a4b83SYohann 
420241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
421241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
422241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
423ac421f39SYohann   CeedScalar r_B[P1d];
424ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
425ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
426ac421f39SYohann   }
427ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
428241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
429241a4b83SYohann     __syncthreads();
430241a4b83SYohann     V[k] = 0.0;
431ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
432ac421f39SYohann       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
433241a4b83SYohann     }
434241a4b83SYohann     __syncthreads();
435241a4b83SYohann   }
436241a4b83SYohann }
437241a4b83SYohann 
438241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
439241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
440241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
441ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
442241a4b83SYohann     V[k] = 0.0;
443ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
444241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
445241a4b83SYohann     }
446241a4b83SYohann   }
447241a4b83SYohann }
448241a4b83SYohann 
449241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
450241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
451241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
452ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
453241a4b83SYohann     V[k] = 0.0;
454241a4b83SYohann     if (k<P1d) {
455ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
456241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
457241a4b83SYohann       }
458241a4b83SYohann     }
459241a4b83SYohann   }
460241a4b83SYohann }
461241a4b83SYohann 
462241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
463241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
464241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
465ac421f39SYohann   CeedScalar r_B[Q1d];
466ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
467ac421f39SYohann   {
468ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
469ac421f39SYohann   }
470ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
471241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
472241a4b83SYohann     __syncthreads();
473241a4b83SYohann     V[k] = 0.0;
474241a4b83SYohann     if (data.tidy<P1d) {
475ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
476ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
477ac421f39SYohann       }
478ac421f39SYohann     }
479ac421f39SYohann     __syncthreads();
480ac421f39SYohann   }
481ac421f39SYohann }
482ac421f39SYohann 
483ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
484ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data,
485ac421f39SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
486ac421f39SYohann   CeedScalar r_B[Q1d];
487ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
488ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
489ac421f39SYohann   }
490ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
491ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
492ac421f39SYohann     __syncthreads();
493ac421f39SYohann     if (data.tidy<P1d) {
494ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
495ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
496241a4b83SYohann       }
497241a4b83SYohann     }
498241a4b83SYohann     __syncthreads();
499241a4b83SYohann   }
500241a4b83SYohann }
501241a4b83SYohann 
502241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
503241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
504241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
505ac421f39SYohann   CeedScalar r_B[Q1d];
506ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
507ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
508ac421f39SYohann   }
509ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
510241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
511241a4b83SYohann     __syncthreads();
512241a4b83SYohann     V[k] = 0.0;
513241a4b83SYohann     if (data.tidx<P1d) {
514ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
515ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
516241a4b83SYohann       }
517241a4b83SYohann     }
518241a4b83SYohann     __syncthreads();
519241a4b83SYohann   }
520241a4b83SYohann }
521241a4b83SYohann 
522241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
523241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
524241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
525ac421f39SYohann   CeedScalar r_B[Q1d];
526ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
527ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
528ac421f39SYohann   }
529ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
530241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
531241a4b83SYohann     __syncthreads();
532241a4b83SYohann     if (data.tidx<P1d) {
533ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
534ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
535241a4b83SYohann       }
536241a4b83SYohann     }
537241a4b83SYohann     __syncthreads();
538241a4b83SYohann   }
539241a4b83SYohann }
540241a4b83SYohann 
541241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
542241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
543241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
544241a4b83SYohann   CeedScalar r_t1[Q1d];
545241a4b83SYohann   CeedScalar r_t2[Q1d];
546ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
547241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
548241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
549241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
550241a4b83SYohann   }
551241a4b83SYohann }
552241a4b83SYohann 
553241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
554241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
555241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
556241a4b83SYohann   CeedScalar r_t1[Q1d];
557241a4b83SYohann   CeedScalar r_t2[Q1d];
558ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
559241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
560241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
561241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
562241a4b83SYohann   }
563241a4b83SYohann }
564241a4b83SYohann 
565241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
566241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
567241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
568241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
569241a4b83SYohann   CeedScalar r_t1[Q1d];
570241a4b83SYohann   CeedScalar r_t2[Q1d];
571ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
572241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
573241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
574241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
575241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
576241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
577241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
578241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
579241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
580241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
581241a4b83SYohann   }
582241a4b83SYohann }
583241a4b83SYohann 
584241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
585241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
586241a4b83SYohann                                        const CeedScalar *c_B, const CeedScalar *c_G,
587241a4b83SYohann                                        CeedScalar *__restrict__ r_V) {
588241a4b83SYohann   CeedScalar r_t1[Q1d];
589241a4b83SYohann   CeedScalar r_t2[Q1d];
590ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
591241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
592241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
593241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
594241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
595241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
596241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
597241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
598241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
599241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
600241a4b83SYohann   }
601241a4b83SYohann }
602241a4b83SYohann 
603ac421f39SYohann template <int NCOMP, int Q1d>
604ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
605ac421f39SYohann                                    const CeedScalar *__restrict__ r_U,
606ac421f39SYohann                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
607ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
608ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
609ac421f39SYohann     __syncthreads();
610ac421f39SYohann     // X derivative
611ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
612ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
613ac421f39SYohann       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
614ac421f39SYohann     }
615ac421f39SYohann     // Y derivative
616ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
617ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
618ac421f39SYohann       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
619ac421f39SYohann     }
620ac421f39SYohann     // Z derivative
621ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
622ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
623ac421f39SYohann       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
624ac421f39SYohann     }
625ac421f39SYohann     __syncthreads();
626ac421f39SYohann   }
627ac421f39SYohann }
628ac421f39SYohann 
629ac421f39SYohann template <int NCOMP, int Q1d>
630ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
631ac421f39SYohann                                             const CeedScalar *__restrict__ r_U,
632ac421f39SYohann                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
633ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
634ac421f39SYohann     // X derivative
635ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
636ac421f39SYohann     __syncthreads();
637ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
638ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
639ac421f39SYohann     }
640ac421f39SYohann     __syncthreads();
641ac421f39SYohann     // Y derivative
642ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
643ac421f39SYohann     __syncthreads();
644ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
645ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
646ac421f39SYohann     }
647ac421f39SYohann     __syncthreads();
648ac421f39SYohann     // Z derivative
649ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
650ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
651ac421f39SYohann     }
652ac421f39SYohann   }
653ac421f39SYohann }
654ac421f39SYohann 
655241a4b83SYohann template <int Q1d>
656241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
657241a4b83SYohann   *w = qweight1d[data.tidx];
658241a4b83SYohann }
659241a4b83SYohann 
660241a4b83SYohann template <int Q1d>
661241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
662241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
663241a4b83SYohann }
664241a4b83SYohann 
665241a4b83SYohann template <int Q1d>
666241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
667241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
668ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
669241a4b83SYohann   {
670241a4b83SYohann     w[z] = pw*qweight1d[z];
671241a4b83SYohann   }
672241a4b83SYohann }
673241a4b83SYohann 
674241a4b83SYohann );
675241a4b83SYohann 
676241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
677241a4b83SYohann 
678241a4b83SYohann   using std::ostringstream;
679241a4b83SYohann   using std::string;
680241a4b83SYohann   int ierr;
681241a4b83SYohann   bool setupdone;
682241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
683241a4b83SYohann   if (setupdone) return 0;
684241a4b83SYohann   Ceed ceed;
685241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
686241a4b83SYohann   CeedOperator_Cuda_gen *data;
687241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
688241a4b83SYohann   CeedQFunction qf;
689241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
690241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
691241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
6921da99368SJeremy L Thompson   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
693*5c7b696cSjeremylt           numoutputfields, ncomp, dim = 0, lsize;
694241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
695241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
696241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
697241a4b83SYohann   CeedChk(ierr);
698241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
699241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
700241a4b83SYohann   CeedChk(ierr);
701241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
702241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
703241a4b83SYohann   CeedChk(ierr);
704241a4b83SYohann   CeedEvalMode emode;
705241a4b83SYohann   CeedBasis basis;
706241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
707241a4b83SYohann   CeedElemRestriction Erestrict;
708241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
709241a4b83SYohann 
710241a4b83SYohann   ostringstream code;
711241a4b83SYohann   string devFunctions(deviceFunctions);
712241a4b83SYohann 
713f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
714f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
715f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
716abfaacbbSSander Arens   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
717f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
718f1a13f77SYohann Dudouit   if (prop.major<6){
719f1a13f77SYohann Dudouit     code << atomicAdd;
720f1a13f77SYohann Dudouit   }
721f1a13f77SYohann Dudouit 
722241a4b83SYohann   code << devFunctions;
723241a4b83SYohann 
724241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
7254d537eeaSYohann 
7264d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
727ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
7281da99368SJeremy L Thompson 
7291da99368SJeremy L Thompson   // Find dim and Q1d
7301da99368SJeremy L Thompson   bool collograd = false;
7311da99368SJeremy L Thompson   for (CeedInt i = 0; i < numinputfields; i++) {
7321da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
7331da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
7341da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
7351da99368SJeremy L Thompson 
7361da99368SJeremy L Thompson       // Check for collocated gradient
7371da99368SJeremy L Thompson       if (basis_data->d_collograd1d)
7381da99368SJeremy L Thompson         collograd = true;
7391da99368SJeremy L Thompson 
7401da99368SJeremy L Thompson       // Collect dim and Q1d
7411da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
7421da99368SJeremy L Thompson       bool isTensor;
7431da99368SJeremy L Thompson       ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
7441da99368SJeremy L Thompson       if (isTensor) {
7451da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
7461da99368SJeremy L Thompson       } else {
7471da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
7481da99368SJeremy L Thompson         }
7491da99368SJeremy L Thompson     }
7501da99368SJeremy L Thompson   }
7511da99368SJeremy L Thompson   // Check output bases for Q1d, dim as well
7521da99368SJeremy L Thompson   //   The only imput basis might be CEED_BASIS_COLLOCATED
7531da99368SJeremy L Thompson   for (CeedInt i = 0; i < numoutputfields; i++) {
7541da99368SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
7551da99368SJeremy L Thompson     if (basis != CEED_BASIS_COLLOCATED) {
7561da99368SJeremy L Thompson       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
7571da99368SJeremy L Thompson       // Collect dim and Q1d
7581da99368SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
7591da99368SJeremy L Thompson       bool isTensor;
7601da99368SJeremy L Thompson       ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
7611da99368SJeremy L Thompson       if (isTensor) {
7621da99368SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
7631da99368SJeremy L Thompson       } else {
7641da99368SJeremy L Thompson         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
7651da99368SJeremy L Thompson         }
7661da99368SJeremy L Thompson     }
7671da99368SJeremy L Thompson   }
7681da99368SJeremy L Thompson   data->dim = dim;
7691da99368SJeremy L Thompson   data->Q1d = Q1d;
7701da99368SJeremy L Thompson 
7711da99368SJeremy L Thompson   // Define CEED_Q_VLA
7721da99368SJeremy L Thompson   if (dim != 3 || collograd) {
7731da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA 1\n\n";
7741da99368SJeremy L Thompson   } else {
7751da99368SJeremy L Thompson     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
7761da99368SJeremy L Thompson   }
7771da99368SJeremy L Thompson 
778241a4b83SYohann   code << qFunction;
779241a4b83SYohann 
780241a4b83SYohann   // Setup
781d80fc06aSjeremylt   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
782241a4b83SYohann   // Input Evecs and Restriction
783241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
784241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
785241a4b83SYohann     CeedChk(ierr);
7861da99368SJeremy L Thompson     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
787241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
788241a4b83SYohann     }
789241a4b83SYohann   }
790241a4b83SYohann 
791241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
792241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
793241a4b83SYohann   }
7941da99368SJeremy L Thompson 
795241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
796241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
7971da99368SJeremy L Thompson 
798241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
799241a4b83SYohann   code << "BackendData data;\n";
800241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
801241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
802241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
803241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
804241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
805920dcdc4Sjeremylt 
806920dcdc4Sjeremylt   code << "\n// Input field constants and basis data\n";
807ac421f39SYohann   //Initialize constants, and matrices B and G
808241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
809920dcdc4Sjeremylt     code << "// -- Input field "<<i<<" --\n";
810241a4b83SYohann     // Get elemsize, emode, ncomp
811241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
812241a4b83SYohann     CeedChk(ierr);
813241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
814241a4b83SYohann     CeedChk(ierr);
815241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
816241a4b83SYohann     CeedChk(ierr);
8174d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
818241a4b83SYohann     CeedChk(ierr);
819920dcdc4Sjeremylt 
820920dcdc4Sjeremylt     // Set field constants
821920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT) {
822920dcdc4Sjeremylt       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
823920dcdc4Sjeremylt       if (basis != CEED_BASIS_COLLOCATED) {
824920dcdc4Sjeremylt         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
825920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
826920dcdc4Sjeremylt       } else {
827920dcdc4Sjeremylt         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
828920dcdc4Sjeremylt       }
829920dcdc4Sjeremylt       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
830920dcdc4Sjeremylt     }
831920dcdc4Sjeremylt 
832920dcdc4Sjeremylt     // Load basis data
833920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
834241a4b83SYohann     switch (emode) {
835241a4b83SYohann     case CEED_EVAL_NONE:
836241a4b83SYohann       break;
837241a4b83SYohann     case CEED_EVAL_INTERP:
838241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
839241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
840241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
841241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
842241a4b83SYohann       break;
843241a4b83SYohann     case CEED_EVAL_GRAD:
844241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
845241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
846241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
847241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
848ac421f39SYohann       if (basis_data->d_collograd1d) {
849ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
850ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
851ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
852ac421f39SYohann       } else {
853ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
854ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
855241a4b83SYohann         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
856ac421f39SYohann       }
857241a4b83SYohann       break;
858241a4b83SYohann     case CEED_EVAL_WEIGHT:
859241a4b83SYohann       break; // No action
860241a4b83SYohann     case CEED_EVAL_DIV:
861241a4b83SYohann       break; // TODO: Not implemented
862241a4b83SYohann     case CEED_EVAL_CURL:
863241a4b83SYohann       break; // TODO: Not implemented
864241a4b83SYohann     }
865241a4b83SYohann   }
866920dcdc4Sjeremylt 
867920dcdc4Sjeremylt   code << "\n// Output field constants and basis data\n";
868241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
869920dcdc4Sjeremylt     code << "// -- Output field "<<i<<" --\n";
870241a4b83SYohann     // Get elemsize, emode, ncomp
871241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
872241a4b83SYohann     CeedChk(ierr);
873241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
874241a4b83SYohann     CeedChk(ierr);
875241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
876241a4b83SYohann     CeedChk(ierr);
8774d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
878241a4b83SYohann     CeedChk(ierr);
879920dcdc4Sjeremylt 
880920dcdc4Sjeremylt     // Set field constants
881241a4b83SYohann     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
882920dcdc4Sjeremylt     if (basis != CEED_BASIS_COLLOCATED) {
883241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
884241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
885920dcdc4Sjeremylt     } else {
886920dcdc4Sjeremylt       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
887920dcdc4Sjeremylt     }
888920dcdc4Sjeremylt     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
889920dcdc4Sjeremylt 
890920dcdc4Sjeremylt     // Load basis data
891920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
892920dcdc4Sjeremylt     switch (emode) {
893920dcdc4Sjeremylt     case CEED_EVAL_NONE:
894920dcdc4Sjeremylt       break; // No action
895920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
896241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
897241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
898241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
899241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
900241a4b83SYohann       break;
901241a4b83SYohann     case CEED_EVAL_GRAD:
902241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
903241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
904241a4b83SYohann       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
905241a4b83SYohann       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
906ac421f39SYohann       if (basis_data->d_collograd1d) {
907ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
908ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
909ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
910ac421f39SYohann       } else {
911ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
912ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
913241a4b83SYohann         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
914ac421f39SYohann       }
915241a4b83SYohann       break;
916241a4b83SYohann     case CEED_EVAL_WEIGHT: {
917241a4b83SYohann       Ceed ceed;
918241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
919241a4b83SYohann       return CeedError(ceed, 1,
920241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
921241a4b83SYohann       break; // Should not occur
922241a4b83SYohann     }
923241a4b83SYohann     case CEED_EVAL_DIV:
924241a4b83SYohann       break; // TODO: Not implemented
925241a4b83SYohann     case CEED_EVAL_CURL:
926241a4b83SYohann       break; // TODO: Not implemented
927241a4b83SYohann     }
928241a4b83SYohann   }
929ac421f39SYohann   code << "\n";
930ac421f39SYohann   code << "__syncthreads();\n";
931241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
932241a4b83SYohann   // Input basis apply if needed
933ac421f39SYohann   // Generate the correct eval mode code for each input
934920dcdc4Sjeremylt   code << "\n// Input field restrictions and basis actions\n";
935241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
936920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
937241a4b83SYohann     // Get elemsize, emode, ncomp
938241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
939241a4b83SYohann     CeedChk(ierr);
940241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
941241a4b83SYohann     CeedChk(ierr);
942241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
943241a4b83SYohann     CeedChk(ierr);
9444d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
945241a4b83SYohann     CeedChk(ierr);
946920dcdc4Sjeremylt 
947920dcdc4Sjeremylt     // Restriction
948920dcdc4Sjeremylt     if (emode != CEED_EVAL_WEIGHT &&
949920dcdc4Sjeremylt         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
950241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
951241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
952241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
953f2b2a896Sjeremylt       if (data->indices.in[i]) {
954*5c7b696cSjeremylt         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
955*5c7b696cSjeremylt         CeedChk(ierr);
956*5c7b696cSjeremylt         code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
957*5c7b696cSjeremylt         CeedInt compstride;
958*5c7b696cSjeremylt         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
959*5c7b696cSjeremylt         code << "  // CompStride: "<<compstride<<"\n";
960*5c7b696cSjeremylt         code << "  readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
961ccedf6b0Sjeremylt       } else {
962920dcdc4Sjeremylt         bool backendstrides;
963920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
964920dcdc4Sjeremylt                                                           &backendstrides);
965920dcdc4Sjeremylt         CeedChk(ierr);
966920dcdc4Sjeremylt         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
967920dcdc4Sjeremylt         if (!backendstrides) {
968920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
969920dcdc4Sjeremylt           CeedChk(ierr);
970ccedf6b0Sjeremylt         }
971920dcdc4Sjeremylt         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
972d80fc06aSjeremylt         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
973920dcdc4Sjeremylt       }
974920dcdc4Sjeremylt     }
975920dcdc4Sjeremylt 
976920dcdc4Sjeremylt     // Basis action
977920dcdc4Sjeremylt     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
978920dcdc4Sjeremylt     switch (emode) {
979920dcdc4Sjeremylt     case CEED_EVAL_NONE:
980920dcdc4Sjeremylt       if (!basis_data->d_collograd1d) {
981920dcdc4Sjeremylt         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
982920dcdc4Sjeremylt       }
983920dcdc4Sjeremylt       break;
984920dcdc4Sjeremylt     case CEED_EVAL_INTERP:
985241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
986241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
987241a4b83SYohann       break;
988241a4b83SYohann     case CEED_EVAL_GRAD:
989ac421f39SYohann       if (basis_data->d_collograd1d) {
990ac421f39SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
991ac421f39SYohann         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
992ac421f39SYohann       } else {
993241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
994241a4b83SYohann         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
995ac421f39SYohann       }
996241a4b83SYohann       break;
997241a4b83SYohann     case CEED_EVAL_WEIGHT:
998241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
999241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1000241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1001241a4b83SYohann       data->W = basis_data->d_qweight1d;
1002241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1003241a4b83SYohann       break; // No action
1004241a4b83SYohann     case CEED_EVAL_DIV:
1005241a4b83SYohann       break; // TODO: Not implemented
1006241a4b83SYohann     case CEED_EVAL_CURL:
1007241a4b83SYohann       break; // TODO: Not implemented
1008241a4b83SYohann     }
1009241a4b83SYohann   }
1010ac421f39SYohann 
1011241a4b83SYohann   // Q function
1012920dcdc4Sjeremylt   code << "\n// Output field setup\n";
1013241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1014920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1015241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1016241a4b83SYohann     CeedChk(ierr);
1017241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1018241a4b83SYohann     {
1019ac421f39SYohann       if (basis_data->d_collograd1d) {
1020ac421f39SYohann         //Accumulator for gradient slices
1021ac421f39SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1022ac421f39SYohann         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1023ac421f39SYohann         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1024ac421f39SYohann         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1025ac421f39SYohann         code << "    }\n";
1026ac421f39SYohann         code << "  }\n";
1027ac421f39SYohann       } else {
1028241a4b83SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1029241a4b83SYohann       }
1030ac421f39SYohann     }
1031241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1032241a4b83SYohann     {
1033241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1034241a4b83SYohann     }
1035241a4b83SYohann   }
1036ac421f39SYohann   //We treat quadrature points per slice in 3d to save registers
1037ac421f39SYohann   if (basis_data->d_collograd1d) {
1038920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1039ac421f39SYohann     code << "#pragma unroll\n";
1040ac421f39SYohann     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1041ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1042920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1043ac421f39SYohann       // Get elemsize, emode, ncomp
1044ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1045ac421f39SYohann       CeedChk(ierr);
1046ac421f39SYohann       // Basis action
1047920dcdc4Sjeremylt       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1048ac421f39SYohann       switch (emode) {
1049ac421f39SYohann       case CEED_EVAL_NONE:
1050ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1051920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1052920dcdc4Sjeremylt         data->indices.in[i] = restr_data->d_ind;
1053920dcdc4Sjeremylt         if (data->indices.in[i]) {
1054*5c7b696cSjeremylt           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1055*5c7b696cSjeremylt           CeedChk(ierr);
1056*5c7b696cSjeremylt           code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1057*5c7b696cSjeremylt           CeedInt compstride;
1058*5c7b696cSjeremylt           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1059*5c7b696cSjeremylt           code << "  // CompStride: "<<compstride<<"\n";
1060*5c7b696cSjeremylt           code << "  readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1061920dcdc4Sjeremylt         } else {
1062920dcdc4Sjeremylt           bool backendstrides;
1063920dcdc4Sjeremylt           ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1064920dcdc4Sjeremylt                                                             &backendstrides);
1065920dcdc4Sjeremylt           CeedChk(ierr);
1066920dcdc4Sjeremylt           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1067920dcdc4Sjeremylt           if (!backendstrides) {
1068920dcdc4Sjeremylt             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1069920dcdc4Sjeremylt             CeedChk(ierr);
1070920dcdc4Sjeremylt           }
1071920dcdc4Sjeremylt           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1072d80fc06aSjeremylt           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1073920dcdc4Sjeremylt         }
1074ac421f39SYohann         break;
1075ac421f39SYohann       case CEED_EVAL_INTERP:
1076ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1077ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1078ac421f39SYohann         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1079ac421f39SYohann         code << "  }\n";
1080ac421f39SYohann         break;
1081ac421f39SYohann       case CEED_EVAL_GRAD:
1082ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1083ac421f39SYohann         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1084ac421f39SYohann         break;
1085ac421f39SYohann       case CEED_EVAL_WEIGHT:
1086ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[1];\n";
1087ac421f39SYohann         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1088ac421f39SYohann         break; // No action
1089ac421f39SYohann       case CEED_EVAL_DIV:
1090ac421f39SYohann         break; // TODO: Not implemented
1091ac421f39SYohann       case CEED_EVAL_CURL:
1092ac421f39SYohann         break; // TODO: Not implemented
1093ac421f39SYohann       }
1094ac421f39SYohann     }
1095ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1096920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1097ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1098ac421f39SYohann       CeedChk(ierr);
1099ac421f39SYohann       // Basis action
1100ac421f39SYohann       switch (emode) {
1101ac421f39SYohann       case CEED_EVAL_NONE:
1102ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1103ac421f39SYohann         break; // No action
1104ac421f39SYohann       case CEED_EVAL_INTERP:
1105ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1106ac421f39SYohann         break;
1107ac421f39SYohann       case CEED_EVAL_GRAD:
1108ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1109ac421f39SYohann         break;
1110ac421f39SYohann       case CEED_EVAL_WEIGHT:
1111ac421f39SYohann         break; // Should not occur
1112ac421f39SYohann       case CEED_EVAL_DIV:
1113ac421f39SYohann         break; // TODO: Not implemented
1114ac421f39SYohann       case CEED_EVAL_CURL:
1115ac421f39SYohann         break; // TODO: Not implemented
1116ac421f39SYohann       }
1117ac421f39SYohann     }
1118ac421f39SYohann   } else {
1119920dcdc4Sjeremylt       code << "\n  // Note: No Collocated Gradient\n";
1120ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1121920dcdc4Sjeremylt       code << "  // -- Input field "<<i<<" --\n";
1122ac421f39SYohann       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1123ac421f39SYohann     }
1124ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1125920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1126ac421f39SYohann       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1127ac421f39SYohann     }
1128ac421f39SYohann   }
1129920dcdc4Sjeremylt   code << "  // QFunction Inputs and outputs\n";
11304d537eeaSYohann   code << "  CeedScalar* in["<<numinputfields<<"];\n";
11314d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1132920dcdc4Sjeremylt     code << "  // -- Input field "<<i<<" --\n";
1133ac421f39SYohann     code << "  in["<<i<<"] = r_q"<<i<<";\n";
11344d537eeaSYohann   }
11354d537eeaSYohann   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
11364d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1137920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1138ac421f39SYohann     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
11394d537eeaSYohann   }
1140920dcdc4Sjeremylt   code << "\n  // Apply QFunction\n";
1141241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
1142ac421f39SYohann   code << "  "<<qFunctionName<<"(ctx, ";
1143ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1144ac421f39SYohann     code << "1 ";
1145ac421f39SYohann   } else {
1146ac421f39SYohann     code << "Q1d ";
1147ac421f39SYohann   }
1148ac421f39SYohann   code << ", in, out);\n";
1149ac421f39SYohann   if (basis_data->d_collograd1d) {
1150920dcdc4Sjeremylt     code << "\n  // Note: Collocated Gradient\n";
1151ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1152920dcdc4Sjeremylt       code << "  // -- Output field "<<i<<" --\n";
1153ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1154ac421f39SYohann       CeedChk(ierr);
1155ac421f39SYohann       // Basis action
1156920dcdc4Sjeremylt       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1157ac421f39SYohann       switch (emode) {
1158ac421f39SYohann       case CEED_EVAL_NONE:
1159ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1160ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1161ac421f39SYohann         code << "  }\n";
1162ac421f39SYohann         break; // No action
1163ac421f39SYohann       case CEED_EVAL_INTERP:
1164ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1165ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1166ac421f39SYohann         code << "  }\n";
1167ac421f39SYohann         break;
1168ac421f39SYohann       case CEED_EVAL_GRAD:
1169ac421f39SYohann         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1170ac421f39SYohann         break;
1171ac421f39SYohann       case CEED_EVAL_WEIGHT:
1172ac421f39SYohann         break; // Should not occur
1173ac421f39SYohann       case CEED_EVAL_DIV:
1174ac421f39SYohann         break; // TODO: Not implemented
1175ac421f39SYohann       case CEED_EVAL_CURL:
1176ac421f39SYohann         break; // TODO: Not implemented
1177ac421f39SYohann       }
1178ac421f39SYohann     }
1179ac421f39SYohann     code << "}\n";
1180ac421f39SYohann   }
1181241a4b83SYohann 
1182241a4b83SYohann   // Output basis apply if needed
1183ac421f39SYohann   // Generate the correct eval mode code for each output
1184920dcdc4Sjeremylt   code << "\n// Output field basis action and restrictions\n";
1185241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1186920dcdc4Sjeremylt     code << "  // -- Output field "<<i<<" --\n";
1187241a4b83SYohann     // Get elemsize, emode, ncomp
1188241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1189241a4b83SYohann     CeedChk(ierr);
1190241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1191241a4b83SYohann     CeedChk(ierr);
1192241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1193241a4b83SYohann     CeedChk(ierr);
11944d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1195241a4b83SYohann     CeedChk(ierr);
1196241a4b83SYohann     // Basis action
1197920dcdc4Sjeremylt     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1198241a4b83SYohann     switch (emode) {
1199241a4b83SYohann     case CEED_EVAL_NONE:
1200920dcdc4Sjeremylt       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1201241a4b83SYohann       break; // No action
1202241a4b83SYohann     case CEED_EVAL_INTERP:
1203241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1204241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1205241a4b83SYohann       break;
1206241a4b83SYohann     case CEED_EVAL_GRAD:
1207241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1208ac421f39SYohann       if (basis_data->d_collograd1d) {
1209ac421f39SYohann         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1210ac421f39SYohann       } else {
1211241a4b83SYohann         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1212ac421f39SYohann       }
1213241a4b83SYohann       break;
1214241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1215241a4b83SYohann       Ceed ceed;
1216241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1217241a4b83SYohann       return CeedError(ceed, 1,
1218241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1219241a4b83SYohann       break; // Should not occur
1220241a4b83SYohann     }
1221241a4b83SYohann     case CEED_EVAL_DIV:
1222241a4b83SYohann       break; // TODO: Not implemented
1223241a4b83SYohann     case CEED_EVAL_CURL:
1224241a4b83SYohann       break; // TODO: Not implemented
1225241a4b83SYohann     }
1226920dcdc4Sjeremylt     // Restriction
1227920dcdc4Sjeremylt     ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1228920dcdc4Sjeremylt     data->indices.out[i] = restr_data->d_ind;
1229920dcdc4Sjeremylt     if (data->indices.out[i]) {
1230*5c7b696cSjeremylt       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1231*5c7b696cSjeremylt       CeedChk(ierr);
1232*5c7b696cSjeremylt       code << "  const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1233*5c7b696cSjeremylt       CeedInt compstride;
1234*5c7b696cSjeremylt       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1235*5c7b696cSjeremylt       code << "  // CompStride: "<<compstride<<"\n";
1236*5c7b696cSjeremylt       code << "  writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1237920dcdc4Sjeremylt     } else {
1238920dcdc4Sjeremylt       bool backendstrides;
1239920dcdc4Sjeremylt       ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1240920dcdc4Sjeremylt                                                         &backendstrides);
1241920dcdc4Sjeremylt       CeedChk(ierr);
1242920dcdc4Sjeremylt       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1243920dcdc4Sjeremylt       if (!backendstrides) {
1244920dcdc4Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1245920dcdc4Sjeremylt         CeedChk(ierr);
1246920dcdc4Sjeremylt       }
1247920dcdc4Sjeremylt       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1248d80fc06aSjeremylt       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1249920dcdc4Sjeremylt     }
1250241a4b83SYohann   }
1251241a4b83SYohann 
1252241a4b83SYohann   code << "  }\n";
1253241a4b83SYohann   code << "}\n\n";
1254241a4b83SYohann 
1255241a4b83SYohann //  std::cout << code.str();
1256241a4b83SYohann 
1257920dcdc4Sjeremylt   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1258920dcdc4Sjeremylt   CeedChk(ierr);
1259241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1260241a4b83SYohann   CeedChk(ierr);
1261241a4b83SYohann 
1262241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1263241a4b83SYohann 
1264241a4b83SYohann   return 0;
1265241a4b83SYohann }
1266