xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision ccedf6b0eedde5bdb26f1d628a64390ea2c01243)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16241a4b83SYohann #include <ceed-backend.h>
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda/ceed-cuda.h"
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
26241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
27241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
28241a4b83SYohann   do {
29241a4b83SYohann     assumed = old;
30241a4b83SYohann     old =
31241a4b83SYohann       atomicCAS(address_as_ull, assumed,
32241a4b83SYohann                 __double_as_longlong(val +
33241a4b83SYohann                                      __longlong_as_double(assumed)));
34241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
35241a4b83SYohann     // (since NaN != NaN)
36241a4b83SYohann   } while (assumed != old);
37241a4b83SYohann   return __longlong_as_double(old);
38241a4b83SYohann }
39f1a13f77SYohann Dudouit );
40f1a13f77SYohann Dudouit 
41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
42f1a13f77SYohann Dudouit 
43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45f1a13f77SYohann Dudouit 
46f1a13f77SYohann Dudouit typedef struct {
47f1a13f77SYohann Dudouit   CeedInt tidx;
48f1a13f77SYohann Dudouit   CeedInt tidy;
49f1a13f77SYohann Dudouit   CeedInt tidz;
50f1a13f77SYohann Dudouit   CeedInt tid;
51f1a13f77SYohann Dudouit   CeedScalar* slice;
52f1a13f77SYohann Dudouit } BackendData;
53241a4b83SYohann 
54241a4b83SYohann template <int P, int Q>
55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56ac421f39SYohann   for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57241a4b83SYohann     B[i] = d_B[i];
58241a4b83SYohann   }
59241a4b83SYohann }
60241a4b83SYohann 
61241a4b83SYohann //****
62241a4b83SYohann // 1D
63241a4b83SYohann template <int NCOMP, int P1d>
64*ccedf6b0Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
65241a4b83SYohann   if (data.tidx<P1d)
66241a4b83SYohann   {
674d537eeaSYohann     const CeedInt node = data.tidx;
68*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
69241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
708795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
71241a4b83SYohann     }
72241a4b83SYohann   }
73241a4b83SYohann }
74241a4b83SYohann 
75241a4b83SYohann template <int NCOMP, int P1d>
76*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
77241a4b83SYohann   if (data.tidx<P1d)
78241a4b83SYohann   {
794d537eeaSYohann     const CeedInt node = data.tidx;
80*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
81241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
82241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
83241a4b83SYohann     }
84241a4b83SYohann   }
85241a4b83SYohann }
86*ccedf6b0Sjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
87*ccedf6b0Sjeremylt   if (data.tidx<P1d)
88*ccedf6b0Sjeremylt   {
89*ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
90*ccedf6b0Sjeremylt     const CeedInt ind = node * strides[0] + elem * strides[2];
91*ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
92*ccedf6b0Sjeremylt       r_u[comp] = d_u[ind + comp * strides[1]];
93*ccedf6b0Sjeremylt     }
94*ccedf6b0Sjeremylt   }
95*ccedf6b0Sjeremylt }
96241a4b83SYohann 
97241a4b83SYohann template <int NCOMP, int Q1d>
98241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
994d537eeaSYohann   const CeedInt node = data.tidx;
1004d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
101241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
102241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
103241a4b83SYohann   }
104241a4b83SYohann }
105241a4b83SYohann 
106241a4b83SYohann template <int NCOMP, int Q1d>
107241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
1084d537eeaSYohann   const CeedInt node = data.tidx;
1094d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
110241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
111241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
112241a4b83SYohann   }
113241a4b83SYohann }
114241a4b83SYohann 
115241a4b83SYohann template <int NCOMP, int P1d>
116*ccedf6b0Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
117241a4b83SYohann   if (data.tidx<P1d)
118241a4b83SYohann   {
1194d537eeaSYohann     const CeedInt node = data.tidx;
120*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
121241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
1228795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
123241a4b83SYohann     }
124241a4b83SYohann   }
125241a4b83SYohann }
126241a4b83SYohann 
127241a4b83SYohann template <int NCOMP, int P1d>
128*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
129241a4b83SYohann   if (data.tidx<P1d)
130241a4b83SYohann   {
1314d537eeaSYohann     const CeedInt node = data.tidx;
132*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d];
133241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
134241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
135241a4b83SYohann     }
136241a4b83SYohann   }
137241a4b83SYohann }
138241a4b83SYohann 
139*ccedf6b0Sjeremylt template <int NCOMP, int P1d>
140*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
141*ccedf6b0Sjeremylt   if (data.tidx<P1d)
142*ccedf6b0Sjeremylt   {
143*ccedf6b0Sjeremylt     const CeedInt node = data.tidx;
144*ccedf6b0Sjeremylt     const CeedInt ind = node * strides[0] + elem * strides[2];
145*ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
146*ccedf6b0Sjeremylt       atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]);
147*ccedf6b0Sjeremylt     }
148*ccedf6b0Sjeremylt   }
149*ccedf6b0Sjeremylt }
150*ccedf6b0Sjeremylt 
151241a4b83SYohann template <int NCOMP, int Q1d>
152241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1534d537eeaSYohann   const CeedInt node = data.tidx;
1544d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
155241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
156241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
157241a4b83SYohann   }
158241a4b83SYohann }
159241a4b83SYohann 
160241a4b83SYohann template <int NCOMP, int Q1d>
161241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1624d537eeaSYohann   const CeedInt node = data.tidx;
1634d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
164241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
165241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
166241a4b83SYohann   }
167241a4b83SYohann }
168241a4b83SYohann 
169241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
170241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
171241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
172241a4b83SYohann   data.slice[data.tidx] = *U;
173241a4b83SYohann   __syncthreads();
174241a4b83SYohann   *V = 0.0;
175ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
176241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
177241a4b83SYohann   }
178241a4b83SYohann   __syncthreads();
179241a4b83SYohann }
180241a4b83SYohann 
181241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
182241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
183241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
184241a4b83SYohann   data.slice[data.tidx] = *U;
185241a4b83SYohann   __syncthreads();
186241a4b83SYohann   *V = 0.0;
187ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
188241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
189241a4b83SYohann   }
190241a4b83SYohann   __syncthreads();
191241a4b83SYohann }
192241a4b83SYohann 
193241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
194241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
195241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
196ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
197241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
198241a4b83SYohann   }
199241a4b83SYohann }
200241a4b83SYohann 
201241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
202241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
203241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
204ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
205241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
206241a4b83SYohann   }
207241a4b83SYohann }
208241a4b83SYohann 
209241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
210241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
211241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
212241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
213ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
214241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
215241a4b83SYohann   }
216241a4b83SYohann }
217241a4b83SYohann 
218241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
219241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
220241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
221241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
222ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
223241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
224241a4b83SYohann   }
225241a4b83SYohann }
226241a4b83SYohann 
227241a4b83SYohann //****
228241a4b83SYohann // 2D
229241a4b83SYohann template <int NCOMP, int P1d>
230*ccedf6b0Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
231241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
232241a4b83SYohann   {
2334d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
234*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
235241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2368795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
237241a4b83SYohann     }
238241a4b83SYohann   }
239241a4b83SYohann }
240241a4b83SYohann 
241241a4b83SYohann template <int NCOMP, int P1d>
242*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
243241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
244241a4b83SYohann   {
2454d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
246*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
247241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
248241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
249241a4b83SYohann     }
250241a4b83SYohann   }
251241a4b83SYohann }
252*ccedf6b0Sjeremylt template <int NCOMP, int P1d>
253*ccedf6b0Sjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
254*ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
255*ccedf6b0Sjeremylt   {
256*ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
257*ccedf6b0Sjeremylt     const CeedInt ind = node * strides[0] + elem * strides[2];
258*ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
259*ccedf6b0Sjeremylt       r_u[comp] = d_u[ind + comp * strides[1]];
260*ccedf6b0Sjeremylt     }
261*ccedf6b0Sjeremylt   }
262*ccedf6b0Sjeremylt }
263241a4b83SYohann 
264241a4b83SYohann template <int NCOMP, int Q1d>
265241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2664d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2674d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
268241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
269241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
270241a4b83SYohann   }
271241a4b83SYohann }
272241a4b83SYohann 
273241a4b83SYohann template <int NCOMP, int Q1d>
274241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2754d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2764d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
277241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
278241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
279241a4b83SYohann   }
280241a4b83SYohann }
281241a4b83SYohann 
282241a4b83SYohann template <int NCOMP, int P1d>
283*ccedf6b0Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
284241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
285241a4b83SYohann   {
2864d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
287*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
288241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2898795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
290241a4b83SYohann     }
291241a4b83SYohann   }
292241a4b83SYohann }
293241a4b83SYohann 
294241a4b83SYohann template <int NCOMP, int P1d>
295*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
296241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
297241a4b83SYohann   {
2984d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
299*ccedf6b0Sjeremylt     const CeedInt ind = indices[node + elem * P1d*P1d];
300241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
301241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
302241a4b83SYohann     }
303241a4b83SYohann   }
304241a4b83SYohann }
305241a4b83SYohann 
306*ccedf6b0Sjeremylt template <int NCOMP, int P1d>
307*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
308*ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d)
309*ccedf6b0Sjeremylt   {
310*ccedf6b0Sjeremylt     const CeedInt node = data.tidx + data.tidy*P1d;
311*ccedf6b0Sjeremylt     const CeedInt ind = node * strides[0] + elem * strides[2];
312*ccedf6b0Sjeremylt     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
313*ccedf6b0Sjeremylt       atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]);
314*ccedf6b0Sjeremylt     }
315*ccedf6b0Sjeremylt   }
316*ccedf6b0Sjeremylt }
317*ccedf6b0Sjeremylt 
318241a4b83SYohann template <int NCOMP, int Q1d>
319241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
3204d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
3214d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
322241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
323241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
324241a4b83SYohann   }
325241a4b83SYohann }
326241a4b83SYohann 
327241a4b83SYohann template <int NCOMP, int Q1d>
328241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
3294d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
3304d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
331241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
332241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
333241a4b83SYohann   }
334241a4b83SYohann }
335241a4b83SYohann 
336241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
337241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
338241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
339241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
340241a4b83SYohann   __syncthreads();
341241a4b83SYohann   *V = 0.0;
342ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
343241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
344241a4b83SYohann   }
345241a4b83SYohann   __syncthreads();
346241a4b83SYohann }
347241a4b83SYohann 
348241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
349241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
350241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
351241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
352241a4b83SYohann   __syncthreads();
353241a4b83SYohann   *V = 0.0;
354ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
355241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
356241a4b83SYohann   }
357241a4b83SYohann   __syncthreads();
358241a4b83SYohann }
359241a4b83SYohann 
360241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
361241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
362241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
363241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
364241a4b83SYohann   __syncthreads();
365241a4b83SYohann   *V = 0.0;
366241a4b83SYohann   if (data.tidy<P1d) {
367ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
368241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
369241a4b83SYohann     }
370241a4b83SYohann   }
371241a4b83SYohann   __syncthreads();
372241a4b83SYohann }
373241a4b83SYohann 
374241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
375241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
376241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
377241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
378241a4b83SYohann   __syncthreads();
379241a4b83SYohann   *V = 0.0;
380241a4b83SYohann   if (data.tidx<P1d) {
381ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
382241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
383241a4b83SYohann     }
384241a4b83SYohann   }
385241a4b83SYohann   __syncthreads();
386241a4b83SYohann }
387241a4b83SYohann 
388241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
389241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
390241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
391241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
392241a4b83SYohann   __syncthreads();
393241a4b83SYohann   if (data.tidx<P1d) {
394ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
395241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
396241a4b83SYohann     }
397241a4b83SYohann   }
398241a4b83SYohann   __syncthreads();
399241a4b83SYohann }
400241a4b83SYohann 
401241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
402241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
403241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
404241a4b83SYohann   CeedScalar r_t[1];
405ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
406241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
407241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
408241a4b83SYohann   }
409241a4b83SYohann }
410241a4b83SYohann 
411241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
412241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
413241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
414241a4b83SYohann   CeedScalar r_t[1];
415ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
416241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
417241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
418241a4b83SYohann   }
419241a4b83SYohann }
420241a4b83SYohann 
421241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
422241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
423241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
424241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
425241a4b83SYohann   CeedScalar r_t[1];
426ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
427241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
428241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
429241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
430241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
431241a4b83SYohann   }
432241a4b83SYohann }
433241a4b83SYohann 
434241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
435241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
436241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
437241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
438241a4b83SYohann   CeedScalar r_t[1];
439ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
440241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
441241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
442241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
443241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
444241a4b83SYohann   }
445241a4b83SYohann }
446241a4b83SYohann 
447241a4b83SYohann //****
448241a4b83SYohann // 3D
449241a4b83SYohann template <int NCOMP, int P1d>
450*ccedf6b0Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
451241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
452241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4534d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
454*ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
455241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
4568795c945Sjeremylt         r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
457241a4b83SYohann       }
458241a4b83SYohann     }
459241a4b83SYohann   }
460241a4b83SYohann }
461241a4b83SYohann 
462241a4b83SYohann template <int NCOMP, int P1d>
463*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
464241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
465241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4664d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
467*ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
468241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
469241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
470241a4b83SYohann       }
471241a4b83SYohann     }
472241a4b83SYohann   }
473241a4b83SYohann }
474*ccedf6b0Sjeremylt template <int NCOMP, int P1d>
475*ccedf6b0Sjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
476*ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
477*ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
478*ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
479*ccedf6b0Sjeremylt       const CeedInt ind = node * strides[0] + elem * strides[2];
480*ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
481*ccedf6b0Sjeremylt         r_u[z+comp*P1d] = d_u[ind + comp * strides[1]];
482*ccedf6b0Sjeremylt       }
483*ccedf6b0Sjeremylt     }
484*ccedf6b0Sjeremylt   }
485*ccedf6b0Sjeremylt }
486241a4b83SYohann 
487241a4b83SYohann template <int NCOMP, int Q1d>
488241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
489241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
4904d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
4914d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
492241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
493241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
494241a4b83SYohann     }
495241a4b83SYohann   }
496241a4b83SYohann }
497241a4b83SYohann 
498241a4b83SYohann template <int NCOMP, int Q1d>
499ac421f39SYohann inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
500ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
501ac421f39SYohann   const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
502ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
503ac421f39SYohann     r_u[comp] = d_u[ind + nquads * comp];
504ac421f39SYohann   }
505ac421f39SYohann }
506ac421f39SYohann 
507ac421f39SYohann template <int NCOMP, int Q1d>
508241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
509241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
5104d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
5114d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
512241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
513241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
514241a4b83SYohann     }
515241a4b83SYohann   }
516241a4b83SYohann }
517241a4b83SYohann 
518ac421f39SYohann template <int NCOMP, int Q1d>
519ac421f39SYohann inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
520ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
521ac421f39SYohann   const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
522ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
523ac421f39SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
524ac421f39SYohann   }
525ac421f39SYohann }
526ac421f39SYohann 
527241a4b83SYohann template <int NCOMP, int P1d>
528*ccedf6b0Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
529241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
530241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
5314d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
532*ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
533241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
5348795c945Sjeremylt         atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
535241a4b83SYohann       }
536241a4b83SYohann     }
537241a4b83SYohann   }
538241a4b83SYohann }
539241a4b83SYohann 
540241a4b83SYohann template <int NCOMP, int P1d>
541*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
542241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
543241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
5444d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
545*ccedf6b0Sjeremylt       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
546241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
547241a4b83SYohann         atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
548241a4b83SYohann       }
549241a4b83SYohann     }
550241a4b83SYohann   }
551241a4b83SYohann }
552241a4b83SYohann 
553*ccedf6b0Sjeremylt template <int NCOMP, int P1d>
554*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
555*ccedf6b0Sjeremylt   if (data.tidx<P1d && data.tidy<P1d) {
556*ccedf6b0Sjeremylt     for (CeedInt z = 0; z < P1d; ++z) {
557*ccedf6b0Sjeremylt       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
558*ccedf6b0Sjeremylt       const CeedInt ind = node * strides[0] + elem * strides[2];
559*ccedf6b0Sjeremylt       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
560*ccedf6b0Sjeremylt         atomicAdd(&d_v[ind + comp * strides[1]], r_v[z+comp*P1d]);
561*ccedf6b0Sjeremylt       }
562*ccedf6b0Sjeremylt     }
563*ccedf6b0Sjeremylt   }
564*ccedf6b0Sjeremylt }
565*ccedf6b0Sjeremylt 
566241a4b83SYohann template <int NCOMP, int Q1d>
567241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
568241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
5694d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
5704d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
571241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
572241a4b83SYohann       d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
573241a4b83SYohann     }
574241a4b83SYohann   }
575241a4b83SYohann }
576241a4b83SYohann 
577241a4b83SYohann template <int NCOMP, int Q1d>
578241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
579241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
5804d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
5814d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
582241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
583241a4b83SYohann       d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
584241a4b83SYohann     }
585241a4b83SYohann   }
586241a4b83SYohann }
587241a4b83SYohann 
588241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
589241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
590241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
591ac421f39SYohann   CeedScalar r_B[P1d];
592ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
593ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
594ac421f39SYohann   }
595ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
596241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
597241a4b83SYohann     __syncthreads();
598241a4b83SYohann     V[k] = 0.0;
599ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
600ac421f39SYohann       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
601241a4b83SYohann     }
602241a4b83SYohann     __syncthreads();
603241a4b83SYohann   }
604241a4b83SYohann }
605241a4b83SYohann 
606241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
607241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
608241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
609ac421f39SYohann   CeedScalar r_B[P1d];
610ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
611ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
612ac421f39SYohann   }
613ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
614241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
615241a4b83SYohann     __syncthreads();
616241a4b83SYohann     V[k] = 0.0;
617ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
618ac421f39SYohann       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
619241a4b83SYohann     }
620241a4b83SYohann     __syncthreads();
621241a4b83SYohann   }
622241a4b83SYohann }
623241a4b83SYohann 
624241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
625241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
626241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
627ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
628241a4b83SYohann     V[k] = 0.0;
629ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
630241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
631241a4b83SYohann     }
632241a4b83SYohann   }
633241a4b83SYohann }
634241a4b83SYohann 
635241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
636241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
637241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
638ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
639241a4b83SYohann     V[k] = 0.0;
640241a4b83SYohann     if (k<P1d) {
641ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
642241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
643241a4b83SYohann       }
644241a4b83SYohann     }
645241a4b83SYohann   }
646241a4b83SYohann }
647241a4b83SYohann 
648241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
649241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
650241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
651ac421f39SYohann   CeedScalar r_B[Q1d];
652ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
653ac421f39SYohann   {
654ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
655ac421f39SYohann   }
656ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
657241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
658241a4b83SYohann     __syncthreads();
659241a4b83SYohann     V[k] = 0.0;
660241a4b83SYohann     if (data.tidy<P1d) {
661ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
662ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
663ac421f39SYohann       }
664ac421f39SYohann     }
665ac421f39SYohann     __syncthreads();
666ac421f39SYohann   }
667ac421f39SYohann }
668ac421f39SYohann 
669ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
670ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data,
671ac421f39SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
672ac421f39SYohann   CeedScalar r_B[Q1d];
673ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
674ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
675ac421f39SYohann   }
676ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
677ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
678ac421f39SYohann     __syncthreads();
679ac421f39SYohann     if (data.tidy<P1d) {
680ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
681ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
682241a4b83SYohann       }
683241a4b83SYohann     }
684241a4b83SYohann     __syncthreads();
685241a4b83SYohann   }
686241a4b83SYohann }
687241a4b83SYohann 
688241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
689241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
690241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
691ac421f39SYohann   CeedScalar r_B[Q1d];
692ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
693ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
694ac421f39SYohann   }
695ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
696241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
697241a4b83SYohann     __syncthreads();
698241a4b83SYohann     V[k] = 0.0;
699241a4b83SYohann     if (data.tidx<P1d) {
700ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
701ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
702241a4b83SYohann       }
703241a4b83SYohann     }
704241a4b83SYohann     __syncthreads();
705241a4b83SYohann   }
706241a4b83SYohann }
707241a4b83SYohann 
708241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
709241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
710241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
711ac421f39SYohann   CeedScalar r_B[Q1d];
712ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
713ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
714ac421f39SYohann   }
715ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
716241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
717241a4b83SYohann     __syncthreads();
718241a4b83SYohann     if (data.tidx<P1d) {
719ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
720ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
721241a4b83SYohann       }
722241a4b83SYohann     }
723241a4b83SYohann     __syncthreads();
724241a4b83SYohann   }
725241a4b83SYohann }
726241a4b83SYohann 
727241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
728241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
729241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
730241a4b83SYohann   CeedScalar r_t1[Q1d];
731241a4b83SYohann   CeedScalar r_t2[Q1d];
732ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
733241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
734241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
735241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
736241a4b83SYohann   }
737241a4b83SYohann }
738241a4b83SYohann 
739241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
740241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
741241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
742241a4b83SYohann   CeedScalar r_t1[Q1d];
743241a4b83SYohann   CeedScalar r_t2[Q1d];
744ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
745241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
746241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
747241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
748241a4b83SYohann   }
749241a4b83SYohann }
750241a4b83SYohann 
751241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
752241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
753241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
754241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
755241a4b83SYohann   CeedScalar r_t1[Q1d];
756241a4b83SYohann   CeedScalar r_t2[Q1d];
757ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
758241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
759241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
760241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
761241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
762241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
763241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
764241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
765241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
766241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
767241a4b83SYohann   }
768241a4b83SYohann }
769241a4b83SYohann 
770241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
771241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
772241a4b83SYohann                                        const CeedScalar *c_B, const CeedScalar *c_G,
773241a4b83SYohann                                        CeedScalar *__restrict__ r_V) {
774241a4b83SYohann   CeedScalar r_t1[Q1d];
775241a4b83SYohann   CeedScalar r_t2[Q1d];
776ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
777241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
778241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
779241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
780241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
781241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
782241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
783241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
784241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
785241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
786241a4b83SYohann   }
787241a4b83SYohann }
788241a4b83SYohann 
789ac421f39SYohann template <int NCOMP, int Q1d>
790ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
791ac421f39SYohann                                    const CeedScalar *__restrict__ r_U,
792ac421f39SYohann                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
793ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
794ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
795ac421f39SYohann     __syncthreads();
796ac421f39SYohann     // X derivative
797ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
798ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
799ac421f39SYohann       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
800ac421f39SYohann     }
801ac421f39SYohann     // Y derivative
802ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
803ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
804ac421f39SYohann       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
805ac421f39SYohann     }
806ac421f39SYohann     // Z derivative
807ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
808ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
809ac421f39SYohann       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
810ac421f39SYohann     }
811ac421f39SYohann     __syncthreads();
812ac421f39SYohann   }
813ac421f39SYohann }
814ac421f39SYohann 
815ac421f39SYohann template <int NCOMP, int Q1d>
816ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
817ac421f39SYohann                                             const CeedScalar *__restrict__ r_U,
818ac421f39SYohann                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
819ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
820ac421f39SYohann     // X derivative
821ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
822ac421f39SYohann     __syncthreads();
823ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
824ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
825ac421f39SYohann     }
826ac421f39SYohann     __syncthreads();
827ac421f39SYohann     // Y derivative
828ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
829ac421f39SYohann     __syncthreads();
830ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
831ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
832ac421f39SYohann     }
833ac421f39SYohann     __syncthreads();
834ac421f39SYohann     // Z derivative
835ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
836ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
837ac421f39SYohann     }
838ac421f39SYohann   }
839ac421f39SYohann }
840ac421f39SYohann 
841241a4b83SYohann template <int Q1d>
842241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
843241a4b83SYohann   *w = qweight1d[data.tidx];
844241a4b83SYohann }
845241a4b83SYohann 
846241a4b83SYohann template <int Q1d>
847241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
848241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
849241a4b83SYohann }
850241a4b83SYohann 
851241a4b83SYohann template <int Q1d>
852241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
853241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
854ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
855241a4b83SYohann   {
856241a4b83SYohann     w[z] = pw*qweight1d[z];
857241a4b83SYohann   }
858241a4b83SYohann }
859241a4b83SYohann 
860241a4b83SYohann );
861241a4b83SYohann 
862241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
863241a4b83SYohann 
864241a4b83SYohann 	using std::ostringstream;
865241a4b83SYohann   using std::string;
866241a4b83SYohann   int ierr;
867241a4b83SYohann   bool setupdone;
868241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
869241a4b83SYohann   if (setupdone) return 0;
870241a4b83SYohann   Ceed ceed;
871241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
872241a4b83SYohann   CeedOperator_Cuda_gen *data;
873241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
874241a4b83SYohann   CeedQFunction qf;
875241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
876241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
877241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
8788795c945Sjeremylt   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
879241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
880241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
881241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
882241a4b83SYohann   CeedChk(ierr);
883241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
884241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
885241a4b83SYohann   CeedChk(ierr);
886241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
887241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
888241a4b83SYohann   CeedChk(ierr);
889241a4b83SYohann   CeedEvalMode emode;
89061dbc9d2Sjeremylt   CeedInterlaceMode imode;
891241a4b83SYohann   CeedBasis basis;
892241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
893241a4b83SYohann   CeedElemRestriction Erestrict;
894241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
895241a4b83SYohann 
896241a4b83SYohann   ostringstream code;
897241a4b83SYohann   string devFunctions(deviceFunctions);
898241a4b83SYohann 
899f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
900f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
901f1a13f77SYohann Dudouit   Ceed delegate;
902f1a13f77SYohann Dudouit   CeedGetDelegate(ceed, &delegate);
903f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
904f1a13f77SYohann Dudouit   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
905f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
906f1a13f77SYohann Dudouit   if (prop.major<6){
907f1a13f77SYohann Dudouit     code << atomicAdd;
908f1a13f77SYohann Dudouit   }
909f1a13f77SYohann Dudouit 
910241a4b83SYohann   code << devFunctions;
911241a4b83SYohann 
912241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
9134d537eeaSYohann 
9144d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
915ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
916241a4b83SYohann   code << qFunction;
917241a4b83SYohann 
918241a4b83SYohann   // Setup
919241a4b83SYohann   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
920241a4b83SYohann   // Input Evecs and Restriction
921241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
922241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
923241a4b83SYohann     CeedChk(ierr);
924241a4b83SYohann     if (emode == CEED_EVAL_WEIGHT) { // Skip
925241a4b83SYohann     } else {
926241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
927241a4b83SYohann       if (emode != CEED_EVAL_NONE)
928241a4b83SYohann       {
929241a4b83SYohann         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
930241a4b83SYohann         bool isTensor;
931241a4b83SYohann         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
932241a4b83SYohann         //TODO check that all are the same
933241a4b83SYohann         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
934241a4b83SYohann         if (isTensor)
935241a4b83SYohann         {
936241a4b83SYohann           //TODO check that all are the same
937241a4b83SYohann           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
938241a4b83SYohann         } else {
939241a4b83SYohann           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
940241a4b83SYohann         }
941241a4b83SYohann       }
942241a4b83SYohann     }
943241a4b83SYohann   }
944241a4b83SYohann   data->dim = dim;
945241a4b83SYohann   data->Q1d = Q1d;
946241a4b83SYohann 
947241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
948241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
949241a4b83SYohann   }
950241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
951241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
952241a4b83SYohann   // code << "const CeedInt Q   = "<<Q<<";\n";
953241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
954241a4b83SYohann   code << "BackendData data;\n";
955241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
956241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
957241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
958241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
959241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
960ac421f39SYohann   //Initialize constants, and matrices B and G
961241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
962241a4b83SYohann     code << "// Input field "<<i<<"\n";
963241a4b83SYohann     // Get elemsize, emode, ncomp
964241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
965241a4b83SYohann     CeedChk(ierr);
966241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
967241a4b83SYohann     CeedChk(ierr);
968241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
969241a4b83SYohann     CeedChk(ierr);
9704d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
971241a4b83SYohann     CeedChk(ierr);
972241a4b83SYohann     // Basis action
973241a4b83SYohann     switch (emode) {
974241a4b83SYohann     case CEED_EVAL_NONE:
9758795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
976241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9778795c945Sjeremylt       code << "  const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n";
978241a4b83SYohann       break;
979241a4b83SYohann     case CEED_EVAL_INTERP:
980241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
981241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
9828795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
983241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
984241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9858795c945Sjeremylt       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
986241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
987241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
988241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
989241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
990241a4b83SYohann       break;
991241a4b83SYohann     case CEED_EVAL_GRAD:
992241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
9938795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
994241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9958795c945Sjeremylt       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
996241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
997241a4b83SYohann       code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
998241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
999241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
1000241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
1001241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
1002ac421f39SYohann       if (basis_data->d_collograd1d) {
1003ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
1004ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
1005ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
1006ac421f39SYohann       } else {
1007ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
1008ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
1009241a4b83SYohann         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
1010ac421f39SYohann       }
1011241a4b83SYohann       break;
1012241a4b83SYohann     case CEED_EVAL_WEIGHT:
1013241a4b83SYohann       break; // No action
1014241a4b83SYohann     case CEED_EVAL_DIV:
1015241a4b83SYohann       break; // TODO: Not implemented
1016241a4b83SYohann     case CEED_EVAL_CURL:
1017241a4b83SYohann       break; // TODO: Not implemented
1018241a4b83SYohann     }
1019241a4b83SYohann   }
1020241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1021241a4b83SYohann     code << "// Output field "<<i<<"\n";
1022241a4b83SYohann     // Get elemsize, emode, ncomp
1023241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1024241a4b83SYohann     CeedChk(ierr);
1025241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1026241a4b83SYohann     CeedChk(ierr);
1027241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1028241a4b83SYohann     CeedChk(ierr);
10294d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1030241a4b83SYohann     CeedChk(ierr);
1031241a4b83SYohann     // Basis action
1032241a4b83SYohann     switch (emode) {
1033241a4b83SYohann     case CEED_EVAL_NONE:
1034241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
10358795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
10368795c945Sjeremylt       code << "  const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n";
1037241a4b83SYohann       break; // No action
1038241a4b83SYohann     case CEED_EVAL_INTERP:
1039241a4b83SYohann       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
1040241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
1041241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
1042241a4b83SYohann       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
1043241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1044241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
1045241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1046241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10478795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
10488795c945Sjeremylt       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1049241a4b83SYohann       break;
1050241a4b83SYohann     case CEED_EVAL_GRAD:
1051241a4b83SYohann       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
1052241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
1053241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
1054241a4b83SYohann       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
1055241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1056241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
1057241a4b83SYohann       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1058241a4b83SYohann       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1059ac421f39SYohann       if (basis_data->d_collograd1d) {
1060ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
1061ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1062ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1063ac421f39SYohann       } else {
1064ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
1065ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1066241a4b83SYohann         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1067ac421f39SYohann       }
10688795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
10698795c945Sjeremylt       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1070241a4b83SYohann       break;
1071241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1072241a4b83SYohann       Ceed ceed;
1073241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1074241a4b83SYohann       return CeedError(ceed, 1,
1075241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1076241a4b83SYohann       break; // Should not occur
1077241a4b83SYohann     }
1078241a4b83SYohann     case CEED_EVAL_DIV:
1079241a4b83SYohann       break; // TODO: Not implemented
1080241a4b83SYohann     case CEED_EVAL_CURL:
1081241a4b83SYohann       break; // TODO: Not implemented
1082241a4b83SYohann     }
1083241a4b83SYohann   }
1084ac421f39SYohann   code << "\n";
1085ac421f39SYohann   code << "__syncthreads();\n";
1086241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1087241a4b83SYohann   // Input basis apply if needed
1088ac421f39SYohann   // Generate the correct eval mode code for each input
1089241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1090241a4b83SYohann     code << "  // Input field "<<i<<"\n";
1091241a4b83SYohann     // Get elemsize, emode, ncomp
1092241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1093241a4b83SYohann     CeedChk(ierr);
1094241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1095241a4b83SYohann     CeedChk(ierr);
1096241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1097241a4b83SYohann     CeedChk(ierr);
10984d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1099241a4b83SYohann     CeedChk(ierr);
1100241a4b83SYohann     // Basis action
1101241a4b83SYohann     switch (emode) {
1102241a4b83SYohann     case CEED_EVAL_NONE:
1103ac421f39SYohann       if (!basis_data->d_collograd1d) {
1104241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
110561dbc9d2Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
110661dbc9d2Sjeremylt         code << "  readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
1107ac421f39SYohann       }
1108241a4b83SYohann       break;
1109241a4b83SYohann     case CEED_EVAL_INTERP:
1110241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1111241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1112241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
1113*ccedf6b0Sjeremylt       if (data->indicies.in[i]) {
1114*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1115*ccedf6b0Sjeremylt       } else {
1116*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr);
1117*ccedf6b0Sjeremylt       }
1118*ccedf6b0Sjeremylt       code << "  readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1119241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1120241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1121241a4b83SYohann       break;
1122241a4b83SYohann     case CEED_EVAL_GRAD:
1123241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1124241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1125241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
1126*ccedf6b0Sjeremylt       if (data->indicies.in[i]) {
1127*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1128*ccedf6b0Sjeremylt       } else {
1129*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr);
1130*ccedf6b0Sjeremylt       }
1131*ccedf6b0Sjeremylt       code << "  readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1132ac421f39SYohann       if (basis_data->d_collograd1d) {
1133ac421f39SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1134ac421f39SYohann         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1135ac421f39SYohann       } else {
1136241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1137241a4b83SYohann         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1138ac421f39SYohann       }
1139241a4b83SYohann       break;
1140241a4b83SYohann     case CEED_EVAL_WEIGHT:
1141241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1142241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1143241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1144241a4b83SYohann       data->W = basis_data->d_qweight1d;
1145241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1146241a4b83SYohann       break; // No action
1147241a4b83SYohann     case CEED_EVAL_DIV:
1148241a4b83SYohann       break; // TODO: Not implemented
1149241a4b83SYohann     case CEED_EVAL_CURL:
1150241a4b83SYohann       break; // TODO: Not implemented
1151241a4b83SYohann     }
1152241a4b83SYohann   }
1153ac421f39SYohann 
1154241a4b83SYohann   // Q function
1155241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1156ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1157241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1158241a4b83SYohann     CeedChk(ierr);
1159241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1160241a4b83SYohann     {
1161ac421f39SYohann       if (basis_data->d_collograd1d) {
1162ac421f39SYohann         //Accumulator for gradient slices
1163ac421f39SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1164ac421f39SYohann         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1165ac421f39SYohann         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1166ac421f39SYohann         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1167ac421f39SYohann         code << "    }\n";
1168ac421f39SYohann         code << "  }\n";
1169ac421f39SYohann       } else {
1170241a4b83SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1171241a4b83SYohann       }
1172ac421f39SYohann     }
1173241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1174241a4b83SYohann     {
1175241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1176241a4b83SYohann     }
1177241a4b83SYohann   }
1178ac421f39SYohann   //We treat quadrature points per slice in 3d to save registers
1179ac421f39SYohann   code << "// QFunction\n";
1180ac421f39SYohann   if (basis_data->d_collograd1d) {
1181ac421f39SYohann     code << "#pragma unroll\n";
1182ac421f39SYohann     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1183ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1184ac421f39SYohann       code << "  // Input field "<<i<<"\n";
1185ac421f39SYohann       // Get elemsize, emode, ncomp
1186ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1187ac421f39SYohann       CeedChk(ierr);
1188ac421f39SYohann       // Basis action
1189ac421f39SYohann       switch (emode) {
1190ac421f39SYohann       case CEED_EVAL_NONE:
1191ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
119261dbc9d2Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
119361dbc9d2Sjeremylt         code << "  readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1194ac421f39SYohann         break;
1195ac421f39SYohann       case CEED_EVAL_INTERP:
1196ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1197ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1198ac421f39SYohann         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1199ac421f39SYohann         code << "  }\n";
1200ac421f39SYohann         break;
1201ac421f39SYohann       case CEED_EVAL_GRAD:
1202ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1203ac421f39SYohann         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1204ac421f39SYohann         break;
1205ac421f39SYohann       case CEED_EVAL_WEIGHT:
1206ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[1];\n";
1207ac421f39SYohann         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1208ac421f39SYohann         break; // No action
1209ac421f39SYohann       case CEED_EVAL_DIV:
1210ac421f39SYohann         break; // TODO: Not implemented
1211ac421f39SYohann       case CEED_EVAL_CURL:
1212ac421f39SYohann         break; // TODO: Not implemented
1213ac421f39SYohann       }
1214ac421f39SYohann     }
1215ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1216ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1217ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1218ac421f39SYohann       CeedChk(ierr);
1219ac421f39SYohann       // Basis action
1220ac421f39SYohann       switch (emode) {
1221ac421f39SYohann       case CEED_EVAL_NONE:
1222ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1223ac421f39SYohann         break; // No action
1224ac421f39SYohann       case CEED_EVAL_INTERP:
1225ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1226ac421f39SYohann         break;
1227ac421f39SYohann       case CEED_EVAL_GRAD:
1228ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1229ac421f39SYohann         break;
1230ac421f39SYohann       case CEED_EVAL_WEIGHT:
1231ac421f39SYohann         break; // Should not occur
1232ac421f39SYohann       case CEED_EVAL_DIV:
1233ac421f39SYohann         break; // TODO: Not implemented
1234ac421f39SYohann       case CEED_EVAL_CURL:
1235ac421f39SYohann         break; // TODO: Not implemented
1236ac421f39SYohann       }
1237ac421f39SYohann     }
1238ac421f39SYohann   } else {
1239ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1240ac421f39SYohann       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1241ac421f39SYohann     }
1242ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1243ac421f39SYohann       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1244ac421f39SYohann     }
1245ac421f39SYohann   }
12464d537eeaSYohann   code << "  CeedScalar* in["<<numinputfields<<"];\n";
12474d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1248ac421f39SYohann     code << "  in["<<i<<"] = r_q"<<i<<";\n";
12494d537eeaSYohann   }
12504d537eeaSYohann   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
12514d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1252ac421f39SYohann     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
12534d537eeaSYohann   }
1254241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
1255ac421f39SYohann   code << "  "<<qFunctionName<<"(ctx, ";
1256ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1257ac421f39SYohann     code << "1 ";
1258ac421f39SYohann   }else{
1259ac421f39SYohann     code << "Q1d ";
1260ac421f39SYohann   }
1261ac421f39SYohann   code << ", in, out);\n";
1262ac421f39SYohann   if (basis_data->d_collograd1d) {
1263ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1264ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1265ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1266ac421f39SYohann       CeedChk(ierr);
1267ac421f39SYohann       // Basis action
1268ac421f39SYohann       switch (emode) {
1269ac421f39SYohann       case CEED_EVAL_NONE:
1270ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1271ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1272ac421f39SYohann         code << "  }\n";
1273ac421f39SYohann         break; // No action
1274ac421f39SYohann       case CEED_EVAL_INTERP:
1275ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1276ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1277ac421f39SYohann         code << "  }\n";
1278ac421f39SYohann         break;
1279ac421f39SYohann       case CEED_EVAL_GRAD:
1280ac421f39SYohann         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1281ac421f39SYohann         break;
1282ac421f39SYohann       case CEED_EVAL_WEIGHT:
1283ac421f39SYohann         break; // Should not occur
1284ac421f39SYohann       case CEED_EVAL_DIV:
1285ac421f39SYohann         break; // TODO: Not implemented
1286ac421f39SYohann       case CEED_EVAL_CURL:
1287ac421f39SYohann         break; // TODO: Not implemented
1288ac421f39SYohann       }
1289ac421f39SYohann     }
1290ac421f39SYohann     code << "}\n";
1291ac421f39SYohann   }
1292241a4b83SYohann 
1293241a4b83SYohann   // Output basis apply if needed
1294ac421f39SYohann   // Generate the correct eval mode code for each output
1295241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1296241a4b83SYohann     code << "  // Output field "<<i<<"\n";
1297241a4b83SYohann     // Get elemsize, emode, ncomp
1298241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1299241a4b83SYohann     CeedChk(ierr);
1300241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1301241a4b83SYohann     CeedChk(ierr);
1302241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1303241a4b83SYohann     CeedChk(ierr);
13044d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1305241a4b83SYohann     CeedChk(ierr);
1306241a4b83SYohann     // Basis action
1307241a4b83SYohann     switch (emode) {
1308241a4b83SYohann     case CEED_EVAL_NONE:
130961dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
131061dbc9d2Sjeremylt       code << "  writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
1311241a4b83SYohann       break; // No action
1312241a4b83SYohann     case CEED_EVAL_INTERP:
1313241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1314241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1315241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1316241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
1317*ccedf6b0Sjeremylt       if (data->indicies.out[i]) {
1318*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1319*ccedf6b0Sjeremylt       } else {
1320*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr);
1321*ccedf6b0Sjeremylt       }
1322*ccedf6b0Sjeremylt       code << "  writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1323241a4b83SYohann       break;
1324241a4b83SYohann     case CEED_EVAL_GRAD:
1325241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1326ac421f39SYohann       if (basis_data->d_collograd1d) {
1327ac421f39SYohann         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1328ac421f39SYohann       } else {
1329241a4b83SYohann         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1330ac421f39SYohann       }
1331241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1332241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
1333*ccedf6b0Sjeremylt       if (data->indicies.out[i]) {
1334*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1335*ccedf6b0Sjeremylt       } else {
1336*ccedf6b0Sjeremylt         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr);
1337*ccedf6b0Sjeremylt       }
1338*ccedf6b0Sjeremylt       code << "  writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1339241a4b83SYohann       break;
1340241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1341241a4b83SYohann       Ceed ceed;
1342241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1343241a4b83SYohann       return CeedError(ceed, 1,
1344241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1345241a4b83SYohann       break; // Should not occur
1346241a4b83SYohann     }
1347241a4b83SYohann     case CEED_EVAL_DIV:
1348241a4b83SYohann       break; // TODO: Not implemented
1349241a4b83SYohann     case CEED_EVAL_CURL:
1350241a4b83SYohann       break; // TODO: Not implemented
1351241a4b83SYohann     }
1352241a4b83SYohann   }
1353241a4b83SYohann 
1354241a4b83SYohann   code << "  }\n";
1355241a4b83SYohann   code << "}\n\n";
1356241a4b83SYohann 
1357241a4b83SYohann   // std::cout << code.str();
1358241a4b83SYohann 
1359241a4b83SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1360241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1361241a4b83SYohann   CeedChk(ierr);
1362241a4b83SYohann 
1363241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1364241a4b83SYohann 
1365241a4b83SYohann   return 0;
1366241a4b83SYohann }
1367