xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 241a4b83dc9c714eb4bcc90729a46be02322143a)
1*241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2*241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3*241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4*241a4b83SYohann //
5*241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6*241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7*241a4b83SYohann // element discretizations for exascale applications. For more information and
8*241a4b83SYohann // source code availability see http://github.com/ceed.
9*241a4b83SYohann //
10*241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11*241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12*241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13*241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14*241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15*241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16*241a4b83SYohann #include <ceed-backend.h>
17*241a4b83SYohann #include "ceed-cuda-gen.h"
18*241a4b83SYohann #include <iostream>
19*241a4b83SYohann #include <sstream>
20*241a4b83SYohann #include "../cuda/ceed-cuda.h"
21*241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22*241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23*241a4b83SYohann 
24*241a4b83SYohann static const char *deviceFunctions = QUOTE(
25*241a4b83SYohann 
26*241a4b83SYohann typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
27*241a4b83SYohann typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
28*241a4b83SYohann 
29*241a4b83SYohann typedef struct {
30*241a4b83SYohann   CeedInt tidx;
31*241a4b83SYohann   CeedInt tidy;
32*241a4b83SYohann   CeedInt tidz;
33*241a4b83SYohann   CeedInt tid;
34*241a4b83SYohann   CeedScalar* slice;
35*241a4b83SYohann } BackendData;
36*241a4b83SYohann 
37*241a4b83SYohann #if __CUDA_ARCH__ < 600
38*241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
39*241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
40*241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
41*241a4b83SYohann   do {
42*241a4b83SYohann     assumed = old;
43*241a4b83SYohann     old =
44*241a4b83SYohann       atomicCAS(address_as_ull, assumed,
45*241a4b83SYohann                 __double_as_longlong(val +
46*241a4b83SYohann                                      __longlong_as_double(assumed)));
47*241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
48*241a4b83SYohann     // (since NaN != NaN)
49*241a4b83SYohann   } while (assumed != old);
50*241a4b83SYohann   return __longlong_as_double(old);
51*241a4b83SYohann }
52*241a4b83SYohann #endif // __CUDA_ARCH__ < 600
53*241a4b83SYohann 
54*241a4b83SYohann template <int P, int Q>
55*241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56*241a4b83SYohann   for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57*241a4b83SYohann     B[i] = d_B[i];
58*241a4b83SYohann   }
59*241a4b83SYohann   __syncthreads;
60*241a4b83SYohann }
61*241a4b83SYohann 
62*241a4b83SYohann //****
63*241a4b83SYohann // 1D
64*241a4b83SYohann template <int NCOMP, int P1d>
65*241a4b83SYohann inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
66*241a4b83SYohann   if (data.tidx<P1d)
67*241a4b83SYohann   {
68*241a4b83SYohann     const CeedInt dof = data.tidx;
69*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
70*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
71*241a4b83SYohann       r_u[comp] = d_u[ind + ndofs * comp];
72*241a4b83SYohann     }
73*241a4b83SYohann   }
74*241a4b83SYohann }
75*241a4b83SYohann 
76*241a4b83SYohann template <int NCOMP, int P1d>
77*241a4b83SYohann inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78*241a4b83SYohann   if (data.tidx<P1d)
79*241a4b83SYohann   {
80*241a4b83SYohann     const CeedInt dof = data.tidx;
81*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
82*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
83*241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
84*241a4b83SYohann     }
85*241a4b83SYohann   }
86*241a4b83SYohann }
87*241a4b83SYohann 
88*241a4b83SYohann template <int NCOMP, int Q1d>
89*241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
90*241a4b83SYohann   const CeedInt dof = data.tidx;
91*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
92*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
93*241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
94*241a4b83SYohann   }
95*241a4b83SYohann }
96*241a4b83SYohann 
97*241a4b83SYohann template <int NCOMP, int Q1d>
98*241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
99*241a4b83SYohann   const CeedInt dof = data.tidx;
100*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
101*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
102*241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
103*241a4b83SYohann   }
104*241a4b83SYohann }
105*241a4b83SYohann 
106*241a4b83SYohann template <int NCOMP, int P1d>
107*241a4b83SYohann inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
108*241a4b83SYohann   if (data.tidx<P1d)
109*241a4b83SYohann   {
110*241a4b83SYohann     const CeedInt dof = data.tidx;
111*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
112*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
113*241a4b83SYohann       atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
114*241a4b83SYohann     }
115*241a4b83SYohann   }
116*241a4b83SYohann }
117*241a4b83SYohann 
118*241a4b83SYohann template <int NCOMP, int P1d>
119*241a4b83SYohann inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
120*241a4b83SYohann   if (data.tidx<P1d)
121*241a4b83SYohann   {
122*241a4b83SYohann     const CeedInt dof = data.tidx;
123*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
124*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
125*241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
126*241a4b83SYohann     }
127*241a4b83SYohann   }
128*241a4b83SYohann }
129*241a4b83SYohann 
130*241a4b83SYohann template <int NCOMP, int Q1d>
131*241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
132*241a4b83SYohann   const CeedInt dof = data.tidx;
133*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
134*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
135*241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
136*241a4b83SYohann   }
137*241a4b83SYohann }
138*241a4b83SYohann 
139*241a4b83SYohann template <int NCOMP, int Q1d>
140*241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
141*241a4b83SYohann   const CeedInt dof = data.tidx;
142*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
143*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
144*241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
145*241a4b83SYohann   }
146*241a4b83SYohann }
147*241a4b83SYohann 
148*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
149*241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
150*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
151*241a4b83SYohann   data.slice[data.tidx] = *U;
152*241a4b83SYohann   __syncthreads();
153*241a4b83SYohann   *V = 0.0;
154*241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
155*241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
156*241a4b83SYohann   }
157*241a4b83SYohann   __syncthreads();
158*241a4b83SYohann }
159*241a4b83SYohann 
160*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
161*241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
162*241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
163*241a4b83SYohann   data.slice[data.tidx] = *U;
164*241a4b83SYohann   __syncthreads();
165*241a4b83SYohann   *V = 0.0;
166*241a4b83SYohann   for (int i = 0; i < Q1d; ++i) {
167*241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
168*241a4b83SYohann   }
169*241a4b83SYohann   __syncthreads();
170*241a4b83SYohann }
171*241a4b83SYohann 
172*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
173*241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
174*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
175*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
176*241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
177*241a4b83SYohann   }
178*241a4b83SYohann }
179*241a4b83SYohann 
180*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
181*241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
182*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
183*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
184*241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
185*241a4b83SYohann   }
186*241a4b83SYohann }
187*241a4b83SYohann 
188*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
189*241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
190*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
191*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
192*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
193*241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
194*241a4b83SYohann   }
195*241a4b83SYohann }
196*241a4b83SYohann 
197*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
198*241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
199*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
200*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
201*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
202*241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
203*241a4b83SYohann   }
204*241a4b83SYohann }
205*241a4b83SYohann 
206*241a4b83SYohann //****
207*241a4b83SYohann // 2D
208*241a4b83SYohann template <int NCOMP, int P1d>
209*241a4b83SYohann inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
210*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
211*241a4b83SYohann   {
212*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
213*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
214*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
215*241a4b83SYohann       r_u[comp] = d_u[ind + ndofs * comp];
216*241a4b83SYohann     }
217*241a4b83SYohann   }
218*241a4b83SYohann }
219*241a4b83SYohann 
220*241a4b83SYohann template <int NCOMP, int P1d>
221*241a4b83SYohann inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
222*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
223*241a4b83SYohann   {
224*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
225*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
226*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
227*241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
228*241a4b83SYohann     }
229*241a4b83SYohann   }
230*241a4b83SYohann }
231*241a4b83SYohann 
232*241a4b83SYohann template <int NCOMP, int Q1d>
233*241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
234*241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
235*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
236*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
237*241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
238*241a4b83SYohann   }
239*241a4b83SYohann }
240*241a4b83SYohann 
241*241a4b83SYohann template <int NCOMP, int Q1d>
242*241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
243*241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
244*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
245*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
246*241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
247*241a4b83SYohann   }
248*241a4b83SYohann }
249*241a4b83SYohann 
250*241a4b83SYohann template <int NCOMP, int P1d>
251*241a4b83SYohann inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
252*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
253*241a4b83SYohann   {
254*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
255*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
256*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
257*241a4b83SYohann       atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
258*241a4b83SYohann     }
259*241a4b83SYohann   }
260*241a4b83SYohann }
261*241a4b83SYohann 
262*241a4b83SYohann template <int NCOMP, int P1d>
263*241a4b83SYohann inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
264*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
265*241a4b83SYohann   {
266*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
267*241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
268*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
269*241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
270*241a4b83SYohann     }
271*241a4b83SYohann   }
272*241a4b83SYohann }
273*241a4b83SYohann 
274*241a4b83SYohann template <int NCOMP, int Q1d>
275*241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
276*241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
277*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
278*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
279*241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
280*241a4b83SYohann   }
281*241a4b83SYohann }
282*241a4b83SYohann 
283*241a4b83SYohann template <int NCOMP, int Q1d>
284*241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
285*241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
286*241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
287*241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
288*241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
289*241a4b83SYohann   }
290*241a4b83SYohann }
291*241a4b83SYohann 
292*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
293*241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
294*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
295*241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
296*241a4b83SYohann   __syncthreads();
297*241a4b83SYohann   *V = 0.0;
298*241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
299*241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
300*241a4b83SYohann   }
301*241a4b83SYohann   __syncthreads();
302*241a4b83SYohann }
303*241a4b83SYohann 
304*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
305*241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
306*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
307*241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
308*241a4b83SYohann   __syncthreads();
309*241a4b83SYohann   *V = 0.0;
310*241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
311*241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
312*241a4b83SYohann   }
313*241a4b83SYohann   __syncthreads();
314*241a4b83SYohann }
315*241a4b83SYohann 
316*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
317*241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
318*241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
319*241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
320*241a4b83SYohann   __syncthreads();
321*241a4b83SYohann   *V = 0.0;
322*241a4b83SYohann   if (data.tidy<P1d) {
323*241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
324*241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
325*241a4b83SYohann     }
326*241a4b83SYohann   }
327*241a4b83SYohann   __syncthreads();
328*241a4b83SYohann }
329*241a4b83SYohann 
330*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
331*241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
332*241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
333*241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
334*241a4b83SYohann   __syncthreads();
335*241a4b83SYohann   *V = 0.0;
336*241a4b83SYohann   if (data.tidx<P1d) {
337*241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
338*241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
339*241a4b83SYohann     }
340*241a4b83SYohann   }
341*241a4b83SYohann   __syncthreads();
342*241a4b83SYohann }
343*241a4b83SYohann 
344*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
345*241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
346*241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
347*241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
348*241a4b83SYohann   __syncthreads();
349*241a4b83SYohann   if (data.tidx<P1d) {
350*241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
351*241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
352*241a4b83SYohann     }
353*241a4b83SYohann   }
354*241a4b83SYohann   __syncthreads();
355*241a4b83SYohann }
356*241a4b83SYohann 
357*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
358*241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
359*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
360*241a4b83SYohann   CeedScalar r_t[1];
361*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
362*241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
363*241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
364*241a4b83SYohann   }
365*241a4b83SYohann }
366*241a4b83SYohann 
367*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
368*241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
369*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
370*241a4b83SYohann   CeedScalar r_t[1];
371*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
372*241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
373*241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
374*241a4b83SYohann   }
375*241a4b83SYohann }
376*241a4b83SYohann 
377*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
378*241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
379*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
380*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
381*241a4b83SYohann   CeedScalar r_t[1];
382*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
383*241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
384*241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
385*241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
386*241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
387*241a4b83SYohann   }
388*241a4b83SYohann }
389*241a4b83SYohann 
390*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
391*241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
392*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
393*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
394*241a4b83SYohann   CeedScalar r_t[1];
395*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
396*241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
397*241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
398*241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
399*241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
400*241a4b83SYohann   }
401*241a4b83SYohann }
402*241a4b83SYohann 
403*241a4b83SYohann //****
404*241a4b83SYohann // 3D
405*241a4b83SYohann template <int NCOMP, int P1d>
406*241a4b83SYohann inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
407*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
408*241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
409*241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
410*241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
411*241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
412*241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind + ndofs * comp];
413*241a4b83SYohann       }
414*241a4b83SYohann     }
415*241a4b83SYohann   }
416*241a4b83SYohann }
417*241a4b83SYohann 
418*241a4b83SYohann template <int NCOMP, int P1d>
419*241a4b83SYohann inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
420*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
421*241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
422*241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
423*241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
424*241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
425*241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
426*241a4b83SYohann       }
427*241a4b83SYohann     }
428*241a4b83SYohann   }
429*241a4b83SYohann }
430*241a4b83SYohann 
431*241a4b83SYohann template <int NCOMP, int Q1d>
432*241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
433*241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
434*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
435*241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
436*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
437*241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
438*241a4b83SYohann     }
439*241a4b83SYohann   }
440*241a4b83SYohann }
441*241a4b83SYohann 
442*241a4b83SYohann template <int NCOMP, int Q1d>
443*241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
444*241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
445*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
446*241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
447*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
448*241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
449*241a4b83SYohann     }
450*241a4b83SYohann   }
451*241a4b83SYohann }
452*241a4b83SYohann 
453*241a4b83SYohann template <int NCOMP, int P1d>
454*241a4b83SYohann inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
455*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
456*241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
457*241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
458*241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
459*241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
460*241a4b83SYohann         atomicAdd(&d_v[ind + ndofs * comp], r_v[z+comp*P1d]);
461*241a4b83SYohann       }
462*241a4b83SYohann     }
463*241a4b83SYohann   }
464*241a4b83SYohann }
465*241a4b83SYohann 
466*241a4b83SYohann template <int NCOMP, int P1d>
467*241a4b83SYohann inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
468*241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
469*241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
470*241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
471*241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
472*241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
473*241a4b83SYohann         atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
474*241a4b83SYohann       }
475*241a4b83SYohann     }
476*241a4b83SYohann   }
477*241a4b83SYohann }
478*241a4b83SYohann 
479*241a4b83SYohann template <int NCOMP, int Q1d>
480*241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
481*241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
482*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
483*241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
484*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
485*241a4b83SYohann       d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
486*241a4b83SYohann     }
487*241a4b83SYohann   }
488*241a4b83SYohann }
489*241a4b83SYohann 
490*241a4b83SYohann template <int NCOMP, int Q1d>
491*241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
492*241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
493*241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
494*241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
495*241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
496*241a4b83SYohann       d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
497*241a4b83SYohann     }
498*241a4b83SYohann   }
499*241a4b83SYohann }
500*241a4b83SYohann 
501*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
502*241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
503*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
504*241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
505*241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
506*241a4b83SYohann     __syncthreads();
507*241a4b83SYohann     V[k] = 0.0;
508*241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
509*241a4b83SYohann       V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
510*241a4b83SYohann     }
511*241a4b83SYohann     __syncthreads();
512*241a4b83SYohann   }
513*241a4b83SYohann }
514*241a4b83SYohann 
515*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
516*241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
517*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
518*241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
519*241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
520*241a4b83SYohann     __syncthreads();
521*241a4b83SYohann     V[k] = 0.0;
522*241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
523*241a4b83SYohann       V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
524*241a4b83SYohann     }
525*241a4b83SYohann     __syncthreads();
526*241a4b83SYohann   }
527*241a4b83SYohann }
528*241a4b83SYohann 
529*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
530*241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
531*241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
532*241a4b83SYohann   for (int k = 0; k < Q1d; ++k) {
533*241a4b83SYohann     V[k] = 0.0;
534*241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
535*241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
536*241a4b83SYohann     }
537*241a4b83SYohann   }
538*241a4b83SYohann }
539*241a4b83SYohann 
540*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
541*241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
542*241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
543*241a4b83SYohann   for (int k = 0; k < Q1d; ++k) {
544*241a4b83SYohann     V[k] = 0.0;
545*241a4b83SYohann     if (k<P1d) {
546*241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
547*241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
548*241a4b83SYohann       }
549*241a4b83SYohann     }
550*241a4b83SYohann   }
551*241a4b83SYohann }
552*241a4b83SYohann 
553*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
554*241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
555*241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
556*241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
557*241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
558*241a4b83SYohann     __syncthreads();
559*241a4b83SYohann     V[k] = 0.0;
560*241a4b83SYohann     if (data.tidy<P1d) {
561*241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
562*241a4b83SYohann         V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
563*241a4b83SYohann       }
564*241a4b83SYohann     }
565*241a4b83SYohann     __syncthreads();
566*241a4b83SYohann   }
567*241a4b83SYohann }
568*241a4b83SYohann 
569*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
570*241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
571*241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
572*241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
573*241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
574*241a4b83SYohann     __syncthreads();
575*241a4b83SYohann     V[k] = 0.0;
576*241a4b83SYohann     if (data.tidx<P1d) {
577*241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
578*241a4b83SYohann         V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
579*241a4b83SYohann       }
580*241a4b83SYohann     }
581*241a4b83SYohann     __syncthreads();
582*241a4b83SYohann   }
583*241a4b83SYohann }
584*241a4b83SYohann 
585*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
586*241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
587*241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
588*241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
589*241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
590*241a4b83SYohann     __syncthreads();
591*241a4b83SYohann     if (data.tidx<P1d) {
592*241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
593*241a4b83SYohann         V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
594*241a4b83SYohann       }
595*241a4b83SYohann     }
596*241a4b83SYohann     __syncthreads();
597*241a4b83SYohann   }
598*241a4b83SYohann }
599*241a4b83SYohann 
600*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
601*241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
602*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
603*241a4b83SYohann   CeedScalar r_t1[Q1d];
604*241a4b83SYohann   CeedScalar r_t2[Q1d];
605*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
606*241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
607*241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
608*241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
609*241a4b83SYohann   }
610*241a4b83SYohann }
611*241a4b83SYohann 
612*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
613*241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
614*241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
615*241a4b83SYohann   CeedScalar r_t1[Q1d];
616*241a4b83SYohann   CeedScalar r_t2[Q1d];
617*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
618*241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
619*241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
620*241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
621*241a4b83SYohann   }
622*241a4b83SYohann }
623*241a4b83SYohann 
624*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
625*241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
626*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
627*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
628*241a4b83SYohann   CeedScalar r_t1[Q1d];
629*241a4b83SYohann   CeedScalar r_t2[Q1d];
630*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
631*241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
632*241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
633*241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
634*241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
635*241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
636*241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
637*241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
638*241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
639*241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
640*241a4b83SYohann   }
641*241a4b83SYohann }
642*241a4b83SYohann 
643*241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
644*241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
645*241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
646*241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
647*241a4b83SYohann   CeedScalar r_t1[Q1d];
648*241a4b83SYohann   CeedScalar r_t2[Q1d];
649*241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
650*241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
651*241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
652*241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
653*241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
654*241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
655*241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
656*241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
657*241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
658*241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
659*241a4b83SYohann   }
660*241a4b83SYohann }
661*241a4b83SYohann 
662*241a4b83SYohann template <int Q1d>
663*241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
664*241a4b83SYohann   *w = qweight1d[data.tidx];
665*241a4b83SYohann }
666*241a4b83SYohann 
667*241a4b83SYohann template <int Q1d>
668*241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
669*241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
670*241a4b83SYohann }
671*241a4b83SYohann 
672*241a4b83SYohann template <int Q1d>
673*241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
674*241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
675*241a4b83SYohann   for (int z = 0; z < Q1d; ++z)
676*241a4b83SYohann   {
677*241a4b83SYohann     w[z] = pw*qweight1d[z];
678*241a4b83SYohann   }
679*241a4b83SYohann }
680*241a4b83SYohann 
681*241a4b83SYohann );
682*241a4b83SYohann 
683*241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
684*241a4b83SYohann 
685*241a4b83SYohann 	using std::ostringstream;
686*241a4b83SYohann   using std::string;
687*241a4b83SYohann   int ierr;
688*241a4b83SYohann   bool setupdone;
689*241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
690*241a4b83SYohann   if (setupdone) return 0;
691*241a4b83SYohann   Ceed ceed;
692*241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
693*241a4b83SYohann   CeedOperator_Cuda_gen *data;
694*241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
695*241a4b83SYohann   CeedQFunction qf;
696*241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
697*241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
698*241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
699*241a4b83SYohann   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof;
700*241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
701*241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
702*241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
703*241a4b83SYohann   CeedChk(ierr);
704*241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
705*241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
706*241a4b83SYohann   CeedChk(ierr);
707*241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
708*241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
709*241a4b83SYohann   CeedChk(ierr);
710*241a4b83SYohann   CeedEvalMode emode;
711*241a4b83SYohann   CeedTransposeMode lmode;
712*241a4b83SYohann   CeedBasis basis;
713*241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
714*241a4b83SYohann   CeedElemRestriction Erestrict;
715*241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
716*241a4b83SYohann 
717*241a4b83SYohann   ostringstream code;
718*241a4b83SYohann   string devFunctions(deviceFunctions);
719*241a4b83SYohann 
720*241a4b83SYohann   code << devFunctions;
721*241a4b83SYohann 
722*241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
723*241a4b83SYohann   code << qFunction;
724*241a4b83SYohann 
725*241a4b83SYohann   // Setup
726*241a4b83SYohann   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
727*241a4b83SYohann   // Input Evecs and Restriction
728*241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
729*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
730*241a4b83SYohann     CeedChk(ierr);
731*241a4b83SYohann     if (emode == CEED_EVAL_WEIGHT) { // Skip
732*241a4b83SYohann     } else {
733*241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
734*241a4b83SYohann       if (emode != CEED_EVAL_NONE)
735*241a4b83SYohann       {
736*241a4b83SYohann         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
737*241a4b83SYohann         bool isTensor;
738*241a4b83SYohann         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
739*241a4b83SYohann         //TODO check that all are the same
740*241a4b83SYohann         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
741*241a4b83SYohann         if (isTensor)
742*241a4b83SYohann         {
743*241a4b83SYohann           //TODO check that all are the same
744*241a4b83SYohann           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
745*241a4b83SYohann         } else {
746*241a4b83SYohann           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
747*241a4b83SYohann         }
748*241a4b83SYohann       }
749*241a4b83SYohann     }
750*241a4b83SYohann   }
751*241a4b83SYohann   data->dim = dim;
752*241a4b83SYohann   data->Q1d = Q1d;
753*241a4b83SYohann 
754*241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
755*241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
756*241a4b83SYohann   }
757*241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
758*241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
759*241a4b83SYohann   // code << "const CeedInt Q   = "<<Q<<";\n";
760*241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
761*241a4b83SYohann   code << "BackendData data;\n";
762*241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
763*241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
764*241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
765*241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
766*241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
767*241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
768*241a4b83SYohann     code << "// Input field "<<i<<"\n";
769*241a4b83SYohann     // Get elemsize, emode, ncomp
770*241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
771*241a4b83SYohann     CeedChk(ierr);
772*241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
773*241a4b83SYohann     CeedChk(ierr);
774*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
775*241a4b83SYohann     CeedChk(ierr);
776*241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
777*241a4b83SYohann     CeedChk(ierr);
778*241a4b83SYohann     // Basis action
779*241a4b83SYohann     switch (emode) {
780*241a4b83SYohann     case CEED_EVAL_NONE:
781*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
782*241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
783*241a4b83SYohann       code << "  const CeedInt nquads_in_"<<i<<" = "<<ndof/ncomp<<";\n";
784*241a4b83SYohann       break;
785*241a4b83SYohann     case CEED_EVAL_INTERP:
786*241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
787*241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
788*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
789*241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
790*241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
791*241a4b83SYohann       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
792*241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
793*241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
794*241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
795*241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
796*241a4b83SYohann       break;
797*241a4b83SYohann     case CEED_EVAL_GRAD:
798*241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
799*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
800*241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
801*241a4b83SYohann       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
802*241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
803*241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
804*241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
805*241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
806*241a4b83SYohann       data->G.in[i] = basis_data->d_grad1d;
807*241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
808*241a4b83SYohann       code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
809*241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
810*241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
811*241a4b83SYohann       break;
812*241a4b83SYohann     case CEED_EVAL_WEIGHT:
813*241a4b83SYohann       break; // No action
814*241a4b83SYohann     case CEED_EVAL_DIV:
815*241a4b83SYohann       break; // TODO: Not implemented
816*241a4b83SYohann     case CEED_EVAL_CURL:
817*241a4b83SYohann       break; // TODO: Not implemented
818*241a4b83SYohann     }
819*241a4b83SYohann   }
820*241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
821*241a4b83SYohann     code << "// Output field "<<i<<"\n";
822*241a4b83SYohann     // Get elemsize, emode, ncomp
823*241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
824*241a4b83SYohann     CeedChk(ierr);
825*241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
826*241a4b83SYohann     CeedChk(ierr);
827*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
828*241a4b83SYohann     CeedChk(ierr);
829*241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
830*241a4b83SYohann     CeedChk(ierr);
831*241a4b83SYohann     // Basis action
832*241a4b83SYohann     switch (emode) {
833*241a4b83SYohann     case CEED_EVAL_NONE:
834*241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
835*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
836*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
837*241a4b83SYohann       code << "  const CeedInt nquads_out_"<<i<<" = "<<ndof<<"/ncomp_out_"<<i<<";\n";
838*241a4b83SYohann       break; // No action
839*241a4b83SYohann     case CEED_EVAL_INTERP:
840*241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
841*241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
842*241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
843*241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
844*241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
845*241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
846*241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
847*241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
848*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
849*241a4b83SYohann       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
850*241a4b83SYohann       break;
851*241a4b83SYohann     case CEED_EVAL_GRAD:
852*241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
853*241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
854*241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
855*241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
856*241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
857*241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
858*241a4b83SYohann       data->G.out[i] = basis_data->d_grad1d;
859*241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
860*241a4b83SYohann       code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
861*241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
862*241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
863*241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
864*241a4b83SYohann       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
865*241a4b83SYohann       break;
866*241a4b83SYohann     case CEED_EVAL_WEIGHT: {
867*241a4b83SYohann       Ceed ceed;
868*241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
869*241a4b83SYohann       return CeedError(ceed, 1,
870*241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
871*241a4b83SYohann       break; // Should not occur
872*241a4b83SYohann     }
873*241a4b83SYohann     case CEED_EVAL_DIV:
874*241a4b83SYohann       break; // TODO: Not implemented
875*241a4b83SYohann     case CEED_EVAL_CURL:
876*241a4b83SYohann       break; // TODO: Not implemented
877*241a4b83SYohann     }
878*241a4b83SYohann   }
879*241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
880*241a4b83SYohann   // Input basis apply if needed
881*241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
882*241a4b83SYohann     code << "// Input field "<<i<<"\n";
883*241a4b83SYohann     // Get elemsize, emode, ncomp
884*241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
885*241a4b83SYohann     CeedChk(ierr);
886*241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
887*241a4b83SYohann     CeedChk(ierr);
888*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
889*241a4b83SYohann     CeedChk(ierr);
890*241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
891*241a4b83SYohann     CeedChk(ierr);
892*241a4b83SYohann     // Basis action
893*241a4b83SYohann     switch (emode) {
894*241a4b83SYohann     case CEED_EVAL_NONE:
895*241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
896*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
897*241a4b83SYohann       code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
898*241a4b83SYohann       break;
899*241a4b83SYohann     case CEED_EVAL_INTERP:
900*241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
901*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
902*241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
903*241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
904*241a4b83SYohann       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
905*241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
906*241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
907*241a4b83SYohann       break;
908*241a4b83SYohann     case CEED_EVAL_GRAD:
909*241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
910*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
911*241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
912*241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
913*241a4b83SYohann       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
914*241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
915*241a4b83SYohann       code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
916*241a4b83SYohann       break;
917*241a4b83SYohann     case CEED_EVAL_WEIGHT:
918*241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
919*241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
920*241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
921*241a4b83SYohann       data->W = basis_data->d_qweight1d;
922*241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
923*241a4b83SYohann       break; // No action
924*241a4b83SYohann     case CEED_EVAL_DIV:
925*241a4b83SYohann       break; // TODO: Not implemented
926*241a4b83SYohann     case CEED_EVAL_CURL:
927*241a4b83SYohann       break; // TODO: Not implemented
928*241a4b83SYohann     }
929*241a4b83SYohann   }
930*241a4b83SYohann   // Q function
931*241a4b83SYohann   code << "// QFunction\n";
932*241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
933*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
934*241a4b83SYohann     CeedChk(ierr);
935*241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
936*241a4b83SYohann     {
937*241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
938*241a4b83SYohann     }
939*241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
940*241a4b83SYohann     {
941*241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
942*241a4b83SYohann     }
943*241a4b83SYohann   }
944*241a4b83SYohann   //TODO write qfunction load for this backend
945*241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
946*241a4b83SYohann   code << "  "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
947*241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
948*241a4b83SYohann     code << "r_t"<<i<<", ";
949*241a4b83SYohann   }
950*241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
951*241a4b83SYohann     code << "r_tt"<<i;
952*241a4b83SYohann     if (i<numoutputfields-1)
953*241a4b83SYohann     {
954*241a4b83SYohann       code << ", ";
955*241a4b83SYohann     }
956*241a4b83SYohann   }
957*241a4b83SYohann   code << ");\n";
958*241a4b83SYohann 
959*241a4b83SYohann   // Output basis apply if needed
960*241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
961*241a4b83SYohann     code << "// Output field "<<i<<"\n";
962*241a4b83SYohann     // Get elemsize, emode, ncomp
963*241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
964*241a4b83SYohann     CeedChk(ierr);
965*241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
966*241a4b83SYohann     CeedChk(ierr);
967*241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
968*241a4b83SYohann     CeedChk(ierr);
969*241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
970*241a4b83SYohann     CeedChk(ierr);
971*241a4b83SYohann     // Basis action
972*241a4b83SYohann     switch (emode) {
973*241a4b83SYohann     case CEED_EVAL_NONE:
974*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
975*241a4b83SYohann       code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
976*241a4b83SYohann       break; // No action
977*241a4b83SYohann     case CEED_EVAL_INTERP:
978*241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
979*241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
980*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
981*241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
982*241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
983*241a4b83SYohann       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
984*241a4b83SYohann       break;
985*241a4b83SYohann     case CEED_EVAL_GRAD:
986*241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
987*241a4b83SYohann       code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
988*241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
989*241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
990*241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
991*241a4b83SYohann       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
992*241a4b83SYohann       break;
993*241a4b83SYohann     case CEED_EVAL_WEIGHT: {
994*241a4b83SYohann       Ceed ceed;
995*241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
996*241a4b83SYohann       return CeedError(ceed, 1,
997*241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
998*241a4b83SYohann       break; // Should not occur
999*241a4b83SYohann     }
1000*241a4b83SYohann     case CEED_EVAL_DIV:
1001*241a4b83SYohann       break; // TODO: Not implemented
1002*241a4b83SYohann     case CEED_EVAL_CURL:
1003*241a4b83SYohann       break; // TODO: Not implemented
1004*241a4b83SYohann     }
1005*241a4b83SYohann   }
1006*241a4b83SYohann 
1007*241a4b83SYohann   code << "  }\n";
1008*241a4b83SYohann   code << "}\n\n";
1009*241a4b83SYohann 
1010*241a4b83SYohann   // std::cout << code.str();
1011*241a4b83SYohann 
1012*241a4b83SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1013*241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1014*241a4b83SYohann   CeedChk(ierr);
1015*241a4b83SYohann 
1016*241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1017*241a4b83SYohann 
1018*241a4b83SYohann   return 0;
1019*241a4b83SYohann }
1020