xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 61dbc9d25335c1a7855bdb901ed4d560d51eee1f)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16241a4b83SYohann #include <ceed-backend.h>
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda/ceed-cuda.h"
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
26241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
27241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
28241a4b83SYohann   do {
29241a4b83SYohann     assumed = old;
30241a4b83SYohann     old =
31241a4b83SYohann       atomicCAS(address_as_ull, assumed,
32241a4b83SYohann                 __double_as_longlong(val +
33241a4b83SYohann                                      __longlong_as_double(assumed)));
34241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
35241a4b83SYohann     // (since NaN != NaN)
36241a4b83SYohann   } while (assumed != old);
37241a4b83SYohann   return __longlong_as_double(old);
38241a4b83SYohann }
39f1a13f77SYohann Dudouit );
40f1a13f77SYohann Dudouit 
41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
42f1a13f77SYohann Dudouit 
43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45f1a13f77SYohann Dudouit 
46f1a13f77SYohann Dudouit typedef struct {
47f1a13f77SYohann Dudouit   CeedInt tidx;
48f1a13f77SYohann Dudouit   CeedInt tidy;
49f1a13f77SYohann Dudouit   CeedInt tidz;
50f1a13f77SYohann Dudouit   CeedInt tid;
51f1a13f77SYohann Dudouit   CeedScalar* slice;
52f1a13f77SYohann Dudouit } BackendData;
53241a4b83SYohann 
54241a4b83SYohann template <int P, int Q>
55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56ac421f39SYohann   for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57241a4b83SYohann     B[i] = d_B[i];
58241a4b83SYohann   }
59241a4b83SYohann }
60241a4b83SYohann 
61241a4b83SYohann //****
62241a4b83SYohann // 1D
63241a4b83SYohann template <int NCOMP, int P1d>
648795c945Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
65241a4b83SYohann   if (data.tidx<P1d)
66241a4b83SYohann   {
674d537eeaSYohann     const CeedInt node = data.tidx;
684d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
69241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
708795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
71241a4b83SYohann     }
72241a4b83SYohann   }
73241a4b83SYohann }
74241a4b83SYohann 
75241a4b83SYohann template <int NCOMP, int P1d>
768795c945Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
77241a4b83SYohann   if (data.tidx<P1d)
78241a4b83SYohann   {
794d537eeaSYohann     const CeedInt node = data.tidx;
804d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
81241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
82241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
83241a4b83SYohann     }
84241a4b83SYohann   }
85241a4b83SYohann }
86241a4b83SYohann 
87241a4b83SYohann template <int NCOMP, int Q1d>
88241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
894d537eeaSYohann   const CeedInt node = data.tidx;
904d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
91241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
92241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
93241a4b83SYohann   }
94241a4b83SYohann }
95241a4b83SYohann 
96241a4b83SYohann template <int NCOMP, int Q1d>
97241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
984d537eeaSYohann   const CeedInt node = data.tidx;
994d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
100241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
101241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
102241a4b83SYohann   }
103241a4b83SYohann }
104241a4b83SYohann 
105241a4b83SYohann template <int NCOMP, int P1d>
1068795c945Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
107241a4b83SYohann   if (data.tidx<P1d)
108241a4b83SYohann   {
1094d537eeaSYohann     const CeedInt node = data.tidx;
1104d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
111241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
1128795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
113241a4b83SYohann     }
114241a4b83SYohann   }
115241a4b83SYohann }
116241a4b83SYohann 
117241a4b83SYohann template <int NCOMP, int P1d>
1188795c945Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
119241a4b83SYohann   if (data.tidx<P1d)
120241a4b83SYohann   {
1214d537eeaSYohann     const CeedInt node = data.tidx;
1224d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
123241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
124241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
125241a4b83SYohann     }
126241a4b83SYohann   }
127241a4b83SYohann }
128241a4b83SYohann 
129241a4b83SYohann template <int NCOMP, int Q1d>
130241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1314d537eeaSYohann   const CeedInt node = data.tidx;
1324d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
133241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
134241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
135241a4b83SYohann   }
136241a4b83SYohann }
137241a4b83SYohann 
138241a4b83SYohann template <int NCOMP, int Q1d>
139241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1404d537eeaSYohann   const CeedInt node = data.tidx;
1414d537eeaSYohann   const CeedInt ind = node + elem * Q1d;
142241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
143241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
144241a4b83SYohann   }
145241a4b83SYohann }
146241a4b83SYohann 
147241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
148241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
149241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
150241a4b83SYohann   data.slice[data.tidx] = *U;
151241a4b83SYohann   __syncthreads();
152241a4b83SYohann   *V = 0.0;
153ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
154241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
155241a4b83SYohann   }
156241a4b83SYohann   __syncthreads();
157241a4b83SYohann }
158241a4b83SYohann 
159241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
160241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
161241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162241a4b83SYohann   data.slice[data.tidx] = *U;
163241a4b83SYohann   __syncthreads();
164241a4b83SYohann   *V = 0.0;
165ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
166241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
167241a4b83SYohann   }
168241a4b83SYohann   __syncthreads();
169241a4b83SYohann }
170241a4b83SYohann 
171241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
172241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
173241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
174ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
175241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
176241a4b83SYohann   }
177241a4b83SYohann }
178241a4b83SYohann 
179241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
180241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
181241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
182ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
183241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
184241a4b83SYohann   }
185241a4b83SYohann }
186241a4b83SYohann 
187241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
188241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
189241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
190241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
191ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
192241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
193241a4b83SYohann   }
194241a4b83SYohann }
195241a4b83SYohann 
196241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
197241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
198241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
199241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
200ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
201241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
202241a4b83SYohann   }
203241a4b83SYohann }
204241a4b83SYohann 
205241a4b83SYohann //****
206241a4b83SYohann // 2D
207241a4b83SYohann template <int NCOMP, int P1d>
2088795c945Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
209241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
210241a4b83SYohann   {
2114d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
2124d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
213241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2148795c945Sjeremylt       r_u[comp] = d_u[ind + nnodes * comp];
215241a4b83SYohann     }
216241a4b83SYohann   }
217241a4b83SYohann }
218241a4b83SYohann 
219241a4b83SYohann template <int NCOMP, int P1d>
2208795c945Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
221241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
222241a4b83SYohann   {
2234d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
2244d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
225241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
226241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
227241a4b83SYohann     }
228241a4b83SYohann   }
229241a4b83SYohann }
230241a4b83SYohann 
231241a4b83SYohann template <int NCOMP, int Q1d>
232241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2334d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2344d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
235241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
236241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
237241a4b83SYohann   }
238241a4b83SYohann }
239241a4b83SYohann 
240241a4b83SYohann template <int NCOMP, int Q1d>
241241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2424d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2434d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
244241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
245241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
246241a4b83SYohann   }
247241a4b83SYohann }
248241a4b83SYohann 
249241a4b83SYohann template <int NCOMP, int P1d>
2508795c945Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
251241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
252241a4b83SYohann   {
2534d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
2544d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
255241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
2568795c945Sjeremylt       atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
257241a4b83SYohann     }
258241a4b83SYohann   }
259241a4b83SYohann }
260241a4b83SYohann 
261241a4b83SYohann template <int NCOMP, int P1d>
2628795c945Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
263241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
264241a4b83SYohann   {
2654d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*P1d;
2664d537eeaSYohann     const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
267241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
268241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
269241a4b83SYohann     }
270241a4b83SYohann   }
271241a4b83SYohann }
272241a4b83SYohann 
273241a4b83SYohann template <int NCOMP, int Q1d>
274241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2754d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2764d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
277241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
278241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
279241a4b83SYohann   }
280241a4b83SYohann }
281241a4b83SYohann 
282241a4b83SYohann template <int NCOMP, int Q1d>
283241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2844d537eeaSYohann   const CeedInt node = data.tidx + data.tidy*Q1d;
2854d537eeaSYohann   const CeedInt ind = node + elem * Q1d*Q1d;
286241a4b83SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
287241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
288241a4b83SYohann   }
289241a4b83SYohann }
290241a4b83SYohann 
291241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
292241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
293241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
294241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
295241a4b83SYohann   __syncthreads();
296241a4b83SYohann   *V = 0.0;
297ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
298241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
299241a4b83SYohann   }
300241a4b83SYohann   __syncthreads();
301241a4b83SYohann }
302241a4b83SYohann 
303241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
304241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
305241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
306241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
307241a4b83SYohann   __syncthreads();
308241a4b83SYohann   *V = 0.0;
309ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
310241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
311241a4b83SYohann   }
312241a4b83SYohann   __syncthreads();
313241a4b83SYohann }
314241a4b83SYohann 
315241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
316241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
317241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
318241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
319241a4b83SYohann   __syncthreads();
320241a4b83SYohann   *V = 0.0;
321241a4b83SYohann   if (data.tidy<P1d) {
322ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
323241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
324241a4b83SYohann     }
325241a4b83SYohann   }
326241a4b83SYohann   __syncthreads();
327241a4b83SYohann }
328241a4b83SYohann 
329241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
330241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
331241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
332241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
333241a4b83SYohann   __syncthreads();
334241a4b83SYohann   *V = 0.0;
335241a4b83SYohann   if (data.tidx<P1d) {
336ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
337241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
338241a4b83SYohann     }
339241a4b83SYohann   }
340241a4b83SYohann   __syncthreads();
341241a4b83SYohann }
342241a4b83SYohann 
343241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
344241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
345241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
346241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
347241a4b83SYohann   __syncthreads();
348241a4b83SYohann   if (data.tidx<P1d) {
349ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
350241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
351241a4b83SYohann     }
352241a4b83SYohann   }
353241a4b83SYohann   __syncthreads();
354241a4b83SYohann }
355241a4b83SYohann 
356241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
357241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
358241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
359241a4b83SYohann   CeedScalar r_t[1];
360ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
361241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
362241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
363241a4b83SYohann   }
364241a4b83SYohann }
365241a4b83SYohann 
366241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
367241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
368241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
369241a4b83SYohann   CeedScalar r_t[1];
370ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
371241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
372241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
373241a4b83SYohann   }
374241a4b83SYohann }
375241a4b83SYohann 
376241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
377241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
378241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
379241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
380241a4b83SYohann   CeedScalar r_t[1];
381ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
382241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
383241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
384241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
385241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
386241a4b83SYohann   }
387241a4b83SYohann }
388241a4b83SYohann 
389241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
390241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
391241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
392241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
393241a4b83SYohann   CeedScalar r_t[1];
394ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
395241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
396241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
397241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
398241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
399241a4b83SYohann   }
400241a4b83SYohann }
401241a4b83SYohann 
402241a4b83SYohann //****
403241a4b83SYohann // 3D
404241a4b83SYohann template <int NCOMP, int P1d>
4058795c945Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
406241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
407241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4084d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4094d537eeaSYohann       const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
410241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
4118795c945Sjeremylt         r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
412241a4b83SYohann       }
413241a4b83SYohann     }
414241a4b83SYohann   }
415241a4b83SYohann }
416241a4b83SYohann 
417241a4b83SYohann template <int NCOMP, int P1d>
4188795c945Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
419241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
420241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4214d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4224d537eeaSYohann       const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
423241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
424241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
425241a4b83SYohann       }
426241a4b83SYohann     }
427241a4b83SYohann   }
428241a4b83SYohann }
429241a4b83SYohann 
430241a4b83SYohann template <int NCOMP, int Q1d>
431241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
432241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
4334d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
4344d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
435241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
436241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
437241a4b83SYohann     }
438241a4b83SYohann   }
439241a4b83SYohann }
440241a4b83SYohann 
441241a4b83SYohann template <int NCOMP, int Q1d>
442ac421f39SYohann inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
443ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
444ac421f39SYohann   const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
445ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
446ac421f39SYohann     r_u[comp] = d_u[ind + nquads * comp];
447ac421f39SYohann   }
448ac421f39SYohann }
449ac421f39SYohann 
450ac421f39SYohann template <int NCOMP, int Q1d>
451241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
452241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
4534d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
4544d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
455241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
456241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
457241a4b83SYohann     }
458241a4b83SYohann   }
459241a4b83SYohann }
460241a4b83SYohann 
461ac421f39SYohann template <int NCOMP, int Q1d>
462ac421f39SYohann inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
463ac421f39SYohann   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
464ac421f39SYohann   const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
465ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
466ac421f39SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
467ac421f39SYohann   }
468ac421f39SYohann }
469ac421f39SYohann 
470241a4b83SYohann template <int NCOMP, int P1d>
4718795c945Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
472241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
473241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4744d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4754d537eeaSYohann       const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
476241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
4778795c945Sjeremylt         atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
478241a4b83SYohann       }
479241a4b83SYohann     }
480241a4b83SYohann   }
481241a4b83SYohann }
482241a4b83SYohann 
483241a4b83SYohann template <int NCOMP, int P1d>
4848795c945Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
485241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
486241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
4874d537eeaSYohann       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4884d537eeaSYohann       const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
489241a4b83SYohann       for (CeedInt comp = 0; comp < NCOMP; ++comp) {
490241a4b83SYohann         atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
491241a4b83SYohann       }
492241a4b83SYohann     }
493241a4b83SYohann   }
494241a4b83SYohann }
495241a4b83SYohann 
496241a4b83SYohann template <int NCOMP, int Q1d>
497241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
498241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
4994d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
5004d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
501241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
502241a4b83SYohann       d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
503241a4b83SYohann     }
504241a4b83SYohann   }
505241a4b83SYohann }
506241a4b83SYohann 
507241a4b83SYohann template <int NCOMP, int Q1d>
508241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
509241a4b83SYohann   for (CeedInt z=0; z < Q1d; ++z) {
5104d537eeaSYohann     const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
5114d537eeaSYohann     const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
512241a4b83SYohann     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
513241a4b83SYohann       d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
514241a4b83SYohann     }
515241a4b83SYohann   }
516241a4b83SYohann }
517241a4b83SYohann 
518241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
519241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
520241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
521ac421f39SYohann   CeedScalar r_B[P1d];
522ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
523ac421f39SYohann     r_B[i] = B[i + data.tidx*P1d];
524ac421f39SYohann   }
525ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
526241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
527241a4b83SYohann     __syncthreads();
528241a4b83SYohann     V[k] = 0.0;
529ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
530ac421f39SYohann       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
531241a4b83SYohann     }
532241a4b83SYohann     __syncthreads();
533241a4b83SYohann   }
534241a4b83SYohann }
535241a4b83SYohann 
536241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
537241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
538241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
539ac421f39SYohann   CeedScalar r_B[P1d];
540ac421f39SYohann   for (CeedInt i = 0; i < P1d; ++i) {
541ac421f39SYohann     r_B[i] = B[i + data.tidy*P1d];
542ac421f39SYohann   }
543ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
544241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
545241a4b83SYohann     __syncthreads();
546241a4b83SYohann     V[k] = 0.0;
547ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
548ac421f39SYohann       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
549241a4b83SYohann     }
550241a4b83SYohann     __syncthreads();
551241a4b83SYohann   }
552241a4b83SYohann }
553241a4b83SYohann 
554241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
555241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
556241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
557ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
558241a4b83SYohann     V[k] = 0.0;
559ac421f39SYohann     for (CeedInt i = 0; i < P1d; ++i) {
560241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
561241a4b83SYohann     }
562241a4b83SYohann   }
563241a4b83SYohann }
564241a4b83SYohann 
565241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
566241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
567241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
568ac421f39SYohann   for (CeedInt k = 0; k < Q1d; ++k) {
569241a4b83SYohann     V[k] = 0.0;
570241a4b83SYohann     if (k<P1d) {
571ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
572241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
573241a4b83SYohann       }
574241a4b83SYohann     }
575241a4b83SYohann   }
576241a4b83SYohann }
577241a4b83SYohann 
578241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
579241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
580241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
581ac421f39SYohann   CeedScalar r_B[Q1d];
582ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i)
583ac421f39SYohann   {
584ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
585ac421f39SYohann   }
586ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
587241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
588241a4b83SYohann     __syncthreads();
589241a4b83SYohann     V[k] = 0.0;
590241a4b83SYohann     if (data.tidy<P1d) {
591ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
592ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
593ac421f39SYohann       }
594ac421f39SYohann     }
595ac421f39SYohann     __syncthreads();
596ac421f39SYohann   }
597ac421f39SYohann }
598ac421f39SYohann 
599ac421f39SYohann template <int NCOMP, int P1d, int Q1d>
600ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data,
601ac421f39SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
602ac421f39SYohann   CeedScalar r_B[Q1d];
603ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
604ac421f39SYohann     r_B[i] = B[data.tidy + i*P1d];
605ac421f39SYohann   }
606ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
607ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
608ac421f39SYohann     __syncthreads();
609ac421f39SYohann     if (data.tidy<P1d) {
610ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
611ac421f39SYohann         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
612241a4b83SYohann       }
613241a4b83SYohann     }
614241a4b83SYohann     __syncthreads();
615241a4b83SYohann   }
616241a4b83SYohann }
617241a4b83SYohann 
618241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
619241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
620241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
621ac421f39SYohann   CeedScalar r_B[Q1d];
622ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
623ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
624ac421f39SYohann   }
625ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
626241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
627241a4b83SYohann     __syncthreads();
628241a4b83SYohann     V[k] = 0.0;
629241a4b83SYohann     if (data.tidx<P1d) {
630ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
631ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
632241a4b83SYohann       }
633241a4b83SYohann     }
634241a4b83SYohann     __syncthreads();
635241a4b83SYohann   }
636241a4b83SYohann }
637241a4b83SYohann 
638241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
639241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
640241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
641ac421f39SYohann   CeedScalar r_B[Q1d];
642ac421f39SYohann   for (CeedInt i = 0; i < Q1d; ++i) {
643ac421f39SYohann     r_B[i] = B[data.tidx + i*P1d];
644ac421f39SYohann   }
645ac421f39SYohann   for (CeedInt k = 0; k < P1d; ++k) {
646241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
647241a4b83SYohann     __syncthreads();
648241a4b83SYohann     if (data.tidx<P1d) {
649ac421f39SYohann       for (CeedInt i = 0; i < Q1d; ++i) {
650ac421f39SYohann         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
651241a4b83SYohann       }
652241a4b83SYohann     }
653241a4b83SYohann     __syncthreads();
654241a4b83SYohann   }
655241a4b83SYohann }
656241a4b83SYohann 
657241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
658241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
659241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
660241a4b83SYohann   CeedScalar r_t1[Q1d];
661241a4b83SYohann   CeedScalar r_t2[Q1d];
662ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
663241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
664241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
665241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
666241a4b83SYohann   }
667241a4b83SYohann }
668241a4b83SYohann 
669241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
670241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
671241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
672241a4b83SYohann   CeedScalar r_t1[Q1d];
673241a4b83SYohann   CeedScalar r_t2[Q1d];
674ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
675241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
676241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
677241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
678241a4b83SYohann   }
679241a4b83SYohann }
680241a4b83SYohann 
681241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
682241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
683241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
684241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
685241a4b83SYohann   CeedScalar r_t1[Q1d];
686241a4b83SYohann   CeedScalar r_t2[Q1d];
687ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
688241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
689241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
690241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
691241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
692241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
693241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
694241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
695241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
696241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
697241a4b83SYohann   }
698241a4b83SYohann }
699241a4b83SYohann 
700241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
701241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
702241a4b83SYohann                                        const CeedScalar *c_B, const CeedScalar *c_G,
703241a4b83SYohann                                        CeedScalar *__restrict__ r_V) {
704241a4b83SYohann   CeedScalar r_t1[Q1d];
705241a4b83SYohann   CeedScalar r_t2[Q1d];
706ac421f39SYohann   for (CeedInt comp=0; comp<NCOMP; comp++) {
707241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
708241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
709241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
710241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
711241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
712241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
713241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
714241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
715241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
716241a4b83SYohann   }
717241a4b83SYohann }
718241a4b83SYohann 
719ac421f39SYohann template <int NCOMP, int Q1d>
720ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
721ac421f39SYohann                                    const CeedScalar *__restrict__ r_U,
722ac421f39SYohann                                    const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
723ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
724ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
725ac421f39SYohann     __syncthreads();
726ac421f39SYohann     // X derivative
727ac421f39SYohann     r_V[comp+0*NCOMP] = 0.0;
728ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
729ac421f39SYohann       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
730ac421f39SYohann     }
731ac421f39SYohann     // Y derivative
732ac421f39SYohann     r_V[comp+1*NCOMP] = 0.0;
733ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
734ac421f39SYohann       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
735ac421f39SYohann     }
736ac421f39SYohann     // Z derivative
737ac421f39SYohann     r_V[comp+2*NCOMP] = 0.0;
738ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
739ac421f39SYohann       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
740ac421f39SYohann     }
741ac421f39SYohann     __syncthreads();
742ac421f39SYohann   }
743ac421f39SYohann }
744ac421f39SYohann 
745ac421f39SYohann template <int NCOMP, int Q1d>
746ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
747ac421f39SYohann                                             const CeedScalar *__restrict__ r_U,
748ac421f39SYohann                                             const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
749ac421f39SYohann   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
750ac421f39SYohann     // X derivative
751ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
752ac421f39SYohann     __syncthreads();
753ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
754ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
755ac421f39SYohann     }
756ac421f39SYohann     __syncthreads();
757ac421f39SYohann     // Y derivative
758ac421f39SYohann     data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
759ac421f39SYohann     __syncthreads();
760ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
761ac421f39SYohann       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
762ac421f39SYohann     }
763ac421f39SYohann     __syncthreads();
764ac421f39SYohann     // Z derivative
765ac421f39SYohann     for (CeedInt i = 0; i < Q1d; ++i) {
766ac421f39SYohann       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
767ac421f39SYohann     }
768ac421f39SYohann   }
769ac421f39SYohann }
770ac421f39SYohann 
771241a4b83SYohann template <int Q1d>
772241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
773241a4b83SYohann   *w = qweight1d[data.tidx];
774241a4b83SYohann }
775241a4b83SYohann 
776241a4b83SYohann template <int Q1d>
777241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
778241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
779241a4b83SYohann }
780241a4b83SYohann 
781241a4b83SYohann template <int Q1d>
782241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
783241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
784ac421f39SYohann   for (CeedInt z = 0; z < Q1d; ++z)
785241a4b83SYohann   {
786241a4b83SYohann     w[z] = pw*qweight1d[z];
787241a4b83SYohann   }
788241a4b83SYohann }
789241a4b83SYohann 
790241a4b83SYohann );
791241a4b83SYohann 
792241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
793241a4b83SYohann 
794241a4b83SYohann 	using std::ostringstream;
795241a4b83SYohann   using std::string;
796241a4b83SYohann   int ierr;
797241a4b83SYohann   bool setupdone;
798241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
799241a4b83SYohann   if (setupdone) return 0;
800241a4b83SYohann   Ceed ceed;
801241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
802241a4b83SYohann   CeedOperator_Cuda_gen *data;
803241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
804241a4b83SYohann   CeedQFunction qf;
805241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
806241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
807241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
8088795c945Sjeremylt   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
809241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
810241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
811241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
812241a4b83SYohann   CeedChk(ierr);
813241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
814241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
815241a4b83SYohann   CeedChk(ierr);
816241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
817241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
818241a4b83SYohann   CeedChk(ierr);
819241a4b83SYohann   CeedEvalMode emode;
820*61dbc9d2Sjeremylt   CeedInterlaceMode imode;
821241a4b83SYohann   CeedBasis basis;
822241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
823241a4b83SYohann   CeedElemRestriction Erestrict;
824241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
825241a4b83SYohann 
826241a4b83SYohann   ostringstream code;
827241a4b83SYohann   string devFunctions(deviceFunctions);
828241a4b83SYohann 
829f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
830f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
831f1a13f77SYohann Dudouit   Ceed delegate;
832f1a13f77SYohann Dudouit   CeedGetDelegate(ceed, &delegate);
833f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
834f1a13f77SYohann Dudouit   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
835f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
836f1a13f77SYohann Dudouit   if (prop.major<6){
837f1a13f77SYohann Dudouit     code << atomicAdd;
838f1a13f77SYohann Dudouit   }
839f1a13f77SYohann Dudouit 
840241a4b83SYohann   code << devFunctions;
841241a4b83SYohann 
842241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
8434d537eeaSYohann 
8444d537eeaSYohann   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
845ee07ded2SValeria Barra   code << "\n#define CeedPragmaSIMD\n";
846241a4b83SYohann   code << qFunction;
847241a4b83SYohann 
848241a4b83SYohann   // Setup
849241a4b83SYohann   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
850241a4b83SYohann   // Input Evecs and Restriction
851241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
852241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
853241a4b83SYohann     CeedChk(ierr);
854241a4b83SYohann     if (emode == CEED_EVAL_WEIGHT) { // Skip
855241a4b83SYohann     } else {
856241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
857241a4b83SYohann       if (emode != CEED_EVAL_NONE)
858241a4b83SYohann       {
859241a4b83SYohann         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
860241a4b83SYohann         bool isTensor;
861241a4b83SYohann         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
862241a4b83SYohann         //TODO check that all are the same
863241a4b83SYohann         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
864241a4b83SYohann         if (isTensor)
865241a4b83SYohann         {
866241a4b83SYohann           //TODO check that all are the same
867241a4b83SYohann           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
868241a4b83SYohann         } else {
869241a4b83SYohann           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
870241a4b83SYohann         }
871241a4b83SYohann       }
872241a4b83SYohann     }
873241a4b83SYohann   }
874241a4b83SYohann   data->dim = dim;
875241a4b83SYohann   data->Q1d = Q1d;
876241a4b83SYohann 
877241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
878241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
879241a4b83SYohann   }
880241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
881241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
882241a4b83SYohann   // code << "const CeedInt Q   = "<<Q<<";\n";
883241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
884241a4b83SYohann   code << "BackendData data;\n";
885241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
886241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
887241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
888241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
889241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
890ac421f39SYohann   //Initialize constants, and matrices B and G
891241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
892241a4b83SYohann     code << "// Input field "<<i<<"\n";
893241a4b83SYohann     // Get elemsize, emode, ncomp
894241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
895241a4b83SYohann     CeedChk(ierr);
896241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
897241a4b83SYohann     CeedChk(ierr);
898241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
899241a4b83SYohann     CeedChk(ierr);
9004d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
901241a4b83SYohann     CeedChk(ierr);
902241a4b83SYohann     // Basis action
903241a4b83SYohann     switch (emode) {
904241a4b83SYohann     case CEED_EVAL_NONE:
9058795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
906241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9078795c945Sjeremylt       code << "  const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n";
908241a4b83SYohann       break;
909241a4b83SYohann     case CEED_EVAL_INTERP:
910241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
911241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
9128795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
913241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
914241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9158795c945Sjeremylt       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
916241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
917241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
918241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
919241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
920241a4b83SYohann       break;
921241a4b83SYohann     case CEED_EVAL_GRAD:
922241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
9238795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
924241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9258795c945Sjeremylt       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
926241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
927241a4b83SYohann       code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
928241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
929241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
930241a4b83SYohann       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931241a4b83SYohann       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932ac421f39SYohann       if (basis_data->d_collograd1d) {
933ac421f39SYohann         data->G.in[i] = basis_data->d_collograd1d;
934ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
935ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
936ac421f39SYohann       } else {
937ac421f39SYohann         data->G.in[i] = basis_data->d_grad1d;
938ac421f39SYohann         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
939241a4b83SYohann         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
940ac421f39SYohann       }
941241a4b83SYohann       break;
942241a4b83SYohann     case CEED_EVAL_WEIGHT:
943241a4b83SYohann       break; // No action
944241a4b83SYohann     case CEED_EVAL_DIV:
945241a4b83SYohann       break; // TODO: Not implemented
946241a4b83SYohann     case CEED_EVAL_CURL:
947241a4b83SYohann       break; // TODO: Not implemented
948241a4b83SYohann     }
949241a4b83SYohann   }
950241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
951241a4b83SYohann     code << "// Output field "<<i<<"\n";
952241a4b83SYohann     // Get elemsize, emode, ncomp
953241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
954241a4b83SYohann     CeedChk(ierr);
955241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
956241a4b83SYohann     CeedChk(ierr);
957241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
958241a4b83SYohann     CeedChk(ierr);
9594d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
960241a4b83SYohann     CeedChk(ierr);
961241a4b83SYohann     // Basis action
962241a4b83SYohann     switch (emode) {
963241a4b83SYohann     case CEED_EVAL_NONE:
964241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
9658795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
966*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
9678795c945Sjeremylt       code << "  const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n";
968241a4b83SYohann       break; // No action
969241a4b83SYohann     case CEED_EVAL_INTERP:
970241a4b83SYohann       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
971241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
972241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
973241a4b83SYohann       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
974241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
975241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
976241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
977241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
9788795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
9798795c945Sjeremylt       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
980241a4b83SYohann       break;
981241a4b83SYohann     case CEED_EVAL_GRAD:
982241a4b83SYohann       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
983241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
984241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
985241a4b83SYohann       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
986241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
987241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
988241a4b83SYohann       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989241a4b83SYohann       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990ac421f39SYohann       if (basis_data->d_collograd1d) {
991ac421f39SYohann         data->G.out[i] = basis_data->d_collograd1d;
992ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
993ac421f39SYohann         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
994ac421f39SYohann       } else {
995ac421f39SYohann         data->G.out[i] = basis_data->d_grad1d;
996ac421f39SYohann         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
997241a4b83SYohann         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
998ac421f39SYohann       }
9998795c945Sjeremylt       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
10008795c945Sjeremylt       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1001241a4b83SYohann       break;
1002241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1003241a4b83SYohann       Ceed ceed;
1004241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1005241a4b83SYohann       return CeedError(ceed, 1,
1006241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1007241a4b83SYohann       break; // Should not occur
1008241a4b83SYohann     }
1009241a4b83SYohann     case CEED_EVAL_DIV:
1010241a4b83SYohann       break; // TODO: Not implemented
1011241a4b83SYohann     case CEED_EVAL_CURL:
1012241a4b83SYohann       break; // TODO: Not implemented
1013241a4b83SYohann     }
1014241a4b83SYohann   }
1015ac421f39SYohann   code << "\n";
1016ac421f39SYohann   code << "__syncthreads();\n";
1017241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1018241a4b83SYohann   // Input basis apply if needed
1019ac421f39SYohann   // Generate the correct eval mode code for each input
1020241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1021241a4b83SYohann     code << "  // Input field "<<i<<"\n";
1022241a4b83SYohann     // Get elemsize, emode, ncomp
1023241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1024241a4b83SYohann     CeedChk(ierr);
1025241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1026241a4b83SYohann     CeedChk(ierr);
1027241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1028241a4b83SYohann     CeedChk(ierr);
10294d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1030241a4b83SYohann     CeedChk(ierr);
1031241a4b83SYohann     // Basis action
1032241a4b83SYohann     switch (emode) {
1033241a4b83SYohann     case CEED_EVAL_NONE:
1034ac421f39SYohann       if (!basis_data->d_collograd1d) {
1035241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1036*61dbc9d2Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1037*61dbc9d2Sjeremylt         code << "  readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
1038ac421f39SYohann       }
1039241a4b83SYohann       break;
1040241a4b83SYohann     case CEED_EVAL_INTERP:
1041241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1042*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1043241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1044241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
1045*61dbc9d2Sjeremylt       code << "  readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1046241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1047241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1048241a4b83SYohann       break;
1049241a4b83SYohann     case CEED_EVAL_GRAD:
1050241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1051*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1052241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1053241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
1054*61dbc9d2Sjeremylt       code << "  readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1055ac421f39SYohann       if (basis_data->d_collograd1d) {
1056ac421f39SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1057ac421f39SYohann         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1058ac421f39SYohann       } else {
1059241a4b83SYohann         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1060241a4b83SYohann         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1061ac421f39SYohann       }
1062241a4b83SYohann       break;
1063241a4b83SYohann     case CEED_EVAL_WEIGHT:
1064241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1065241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1066241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1067241a4b83SYohann       data->W = basis_data->d_qweight1d;
1068241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1069241a4b83SYohann       break; // No action
1070241a4b83SYohann     case CEED_EVAL_DIV:
1071241a4b83SYohann       break; // TODO: Not implemented
1072241a4b83SYohann     case CEED_EVAL_CURL:
1073241a4b83SYohann       break; // TODO: Not implemented
1074241a4b83SYohann     }
1075241a4b83SYohann   }
1076ac421f39SYohann 
1077241a4b83SYohann   // Q function
1078241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1079ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1080241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1081241a4b83SYohann     CeedChk(ierr);
1082241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
1083241a4b83SYohann     {
1084ac421f39SYohann       if (basis_data->d_collograd1d) {
1085ac421f39SYohann         //Accumulator for gradient slices
1086ac421f39SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1087ac421f39SYohann         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1088ac421f39SYohann         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1089ac421f39SYohann         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1090ac421f39SYohann         code << "    }\n";
1091ac421f39SYohann         code << "  }\n";
1092ac421f39SYohann       } else {
1093241a4b83SYohann         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1094241a4b83SYohann       }
1095ac421f39SYohann     }
1096241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1097241a4b83SYohann     {
1098241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1099241a4b83SYohann     }
1100241a4b83SYohann   }
1101ac421f39SYohann   //We treat quadrature points per slice in 3d to save registers
1102ac421f39SYohann   code << "// QFunction\n";
1103ac421f39SYohann   if (basis_data->d_collograd1d) {
1104ac421f39SYohann     code << "#pragma unroll\n";
1105ac421f39SYohann     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1106ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1107ac421f39SYohann       code << "  // Input field "<<i<<"\n";
1108ac421f39SYohann       // Get elemsize, emode, ncomp
1109ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1110ac421f39SYohann       CeedChk(ierr);
1111ac421f39SYohann       // Basis action
1112ac421f39SYohann       switch (emode) {
1113ac421f39SYohann       case CEED_EVAL_NONE:
1114ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1115*61dbc9d2Sjeremylt         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1116*61dbc9d2Sjeremylt         code << "  readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1117ac421f39SYohann         break;
1118ac421f39SYohann       case CEED_EVAL_INTERP:
1119ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1120ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1121ac421f39SYohann         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1122ac421f39SYohann         code << "  }\n";
1123ac421f39SYohann         break;
1124ac421f39SYohann       case CEED_EVAL_GRAD:
1125ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1126ac421f39SYohann         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1127ac421f39SYohann         break;
1128ac421f39SYohann       case CEED_EVAL_WEIGHT:
1129ac421f39SYohann         code << "  CeedScalar r_q"<<i<<"[1];\n";
1130ac421f39SYohann         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1131ac421f39SYohann         break; // No action
1132ac421f39SYohann       case CEED_EVAL_DIV:
1133ac421f39SYohann         break; // TODO: Not implemented
1134ac421f39SYohann       case CEED_EVAL_CURL:
1135ac421f39SYohann         break; // TODO: Not implemented
1136ac421f39SYohann       }
1137ac421f39SYohann     }
1138ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1139ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1140ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1141ac421f39SYohann       CeedChk(ierr);
1142ac421f39SYohann       // Basis action
1143ac421f39SYohann       switch (emode) {
1144ac421f39SYohann       case CEED_EVAL_NONE:
1145ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1146ac421f39SYohann         break; // No action
1147ac421f39SYohann       case CEED_EVAL_INTERP:
1148ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1149ac421f39SYohann         break;
1150ac421f39SYohann       case CEED_EVAL_GRAD:
1151ac421f39SYohann         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1152ac421f39SYohann         break;
1153ac421f39SYohann       case CEED_EVAL_WEIGHT:
1154ac421f39SYohann         break; // Should not occur
1155ac421f39SYohann       case CEED_EVAL_DIV:
1156ac421f39SYohann         break; // TODO: Not implemented
1157ac421f39SYohann       case CEED_EVAL_CURL:
1158ac421f39SYohann         break; // TODO: Not implemented
1159ac421f39SYohann       }
1160ac421f39SYohann     }
1161ac421f39SYohann   } else {
1162ac421f39SYohann     for (CeedInt i = 0; i < numinputfields; i++) {
1163ac421f39SYohann       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1164ac421f39SYohann     }
1165ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1166ac421f39SYohann       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1167ac421f39SYohann     }
1168ac421f39SYohann   }
11694d537eeaSYohann   code << "  CeedScalar* in["<<numinputfields<<"];\n";
11704d537eeaSYohann   for (CeedInt i = 0; i < numinputfields; i++) {
1171ac421f39SYohann     code << "  in["<<i<<"] = r_q"<<i<<";\n";
11724d537eeaSYohann   }
11734d537eeaSYohann   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
11744d537eeaSYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1175ac421f39SYohann     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
11764d537eeaSYohann   }
1177241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
1178ac421f39SYohann   code << "  "<<qFunctionName<<"(ctx, ";
1179ac421f39SYohann   if (dim != 3 || basis_data->d_collograd1d) {
1180ac421f39SYohann     code << "1 ";
1181ac421f39SYohann   }else{
1182ac421f39SYohann     code << "Q1d ";
1183ac421f39SYohann   }
1184ac421f39SYohann   code << ", in, out);\n";
1185ac421f39SYohann   if (basis_data->d_collograd1d) {
1186ac421f39SYohann     for (CeedInt i = 0; i < numoutputfields; i++) {
1187ac421f39SYohann       code << "  // Output field "<<i<<"\n";
1188ac421f39SYohann       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1189ac421f39SYohann       CeedChk(ierr);
1190ac421f39SYohann       // Basis action
1191ac421f39SYohann       switch (emode) {
1192ac421f39SYohann       case CEED_EVAL_NONE:
1193ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1194ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1195ac421f39SYohann         code << "  }\n";
1196ac421f39SYohann         break; // No action
1197ac421f39SYohann       case CEED_EVAL_INTERP:
1198ac421f39SYohann         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1199ac421f39SYohann         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1200ac421f39SYohann         code << "  }\n";
1201ac421f39SYohann         break;
1202ac421f39SYohann       case CEED_EVAL_GRAD:
1203ac421f39SYohann         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1204ac421f39SYohann         break;
1205ac421f39SYohann       case CEED_EVAL_WEIGHT:
1206ac421f39SYohann         break; // Should not occur
1207ac421f39SYohann       case CEED_EVAL_DIV:
1208ac421f39SYohann         break; // TODO: Not implemented
1209ac421f39SYohann       case CEED_EVAL_CURL:
1210ac421f39SYohann         break; // TODO: Not implemented
1211ac421f39SYohann       }
1212ac421f39SYohann     }
1213ac421f39SYohann     code << "}\n";
1214ac421f39SYohann   }
1215241a4b83SYohann 
1216241a4b83SYohann   // Output basis apply if needed
1217ac421f39SYohann   // Generate the correct eval mode code for each output
1218241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
1219241a4b83SYohann     code << "  // Output field "<<i<<"\n";
1220241a4b83SYohann     // Get elemsize, emode, ncomp
1221241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1222241a4b83SYohann     CeedChk(ierr);
1223241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1224241a4b83SYohann     CeedChk(ierr);
1225241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1226241a4b83SYohann     CeedChk(ierr);
12274d537eeaSYohann     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1228241a4b83SYohann     CeedChk(ierr);
1229241a4b83SYohann     // Basis action
1230241a4b83SYohann     switch (emode) {
1231241a4b83SYohann     case CEED_EVAL_NONE:
1232*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1233*61dbc9d2Sjeremylt       code << "  writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
1234241a4b83SYohann       break; // No action
1235241a4b83SYohann     case CEED_EVAL_INTERP:
1236241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1237241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1238*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1239241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1240241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
1241*61dbc9d2Sjeremylt       code << "  writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1242241a4b83SYohann       break;
1243241a4b83SYohann     case CEED_EVAL_GRAD:
1244241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1245ac421f39SYohann       if (basis_data->d_collograd1d) {
1246ac421f39SYohann         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1247ac421f39SYohann       } else {
1248241a4b83SYohann         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1249ac421f39SYohann       }
1250*61dbc9d2Sjeremylt       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1251241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1252241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
1253*61dbc9d2Sjeremylt       code << "  writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1254241a4b83SYohann       break;
1255241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1256241a4b83SYohann       Ceed ceed;
1257241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1258241a4b83SYohann       return CeedError(ceed, 1,
1259241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1260241a4b83SYohann       break; // Should not occur
1261241a4b83SYohann     }
1262241a4b83SYohann     case CEED_EVAL_DIV:
1263241a4b83SYohann       break; // TODO: Not implemented
1264241a4b83SYohann     case CEED_EVAL_CURL:
1265241a4b83SYohann       break; // TODO: Not implemented
1266241a4b83SYohann     }
1267241a4b83SYohann   }
1268241a4b83SYohann 
1269241a4b83SYohann   code << "  }\n";
1270241a4b83SYohann   code << "}\n\n";
1271241a4b83SYohann 
1272241a4b83SYohann   // std::cout << code.str();
1273241a4b83SYohann 
1274241a4b83SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1275241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1276241a4b83SYohann   CeedChk(ierr);
1277241a4b83SYohann 
1278241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1279241a4b83SYohann 
1280241a4b83SYohann   return 0;
1281241a4b83SYohann }
1282