xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision f1a13f77bb620dbda785a2856947d36bf01d5e9a)
1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details.
4241a4b83SYohann //
5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software
6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral
7241a4b83SYohann // element discretizations for exascale applications. For more information and
8241a4b83SYohann // source code availability see http://github.com/ceed.
9241a4b83SYohann //
10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office
12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for
13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including
14241a4b83SYohann // software, applications, hardware, advanced system engineering and early
15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative.
16241a4b83SYohann #include <ceed-backend.h>
17241a4b83SYohann #include "ceed-cuda-gen.h"
18241a4b83SYohann #include <iostream>
19241a4b83SYohann #include <sstream>
20241a4b83SYohann #include "../cuda/ceed-cuda.h"
21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h"
22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
23241a4b83SYohann 
24*f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE(
25241a4b83SYohann __device__ double atomicAdd(double *address, double val) {
26241a4b83SYohann   unsigned long long int *address_as_ull = (unsigned long long int *)address;
27241a4b83SYohann   unsigned long long int old = *address_as_ull, assumed;
28241a4b83SYohann   do {
29241a4b83SYohann     assumed = old;
30241a4b83SYohann     old =
31241a4b83SYohann       atomicCAS(address_as_ull, assumed,
32241a4b83SYohann                 __double_as_longlong(val +
33241a4b83SYohann                                      __longlong_as_double(assumed)));
34241a4b83SYohann     // Note: uses integer comparison to avoid hang in case of NaN
35241a4b83SYohann     // (since NaN != NaN)
36241a4b83SYohann   } while (assumed != old);
37241a4b83SYohann   return __longlong_as_double(old);
38241a4b83SYohann }
39*f1a13f77SYohann Dudouit );
40*f1a13f77SYohann Dudouit 
41*f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE(
42*f1a13f77SYohann Dudouit 
43*f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
44*f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45*f1a13f77SYohann Dudouit 
46*f1a13f77SYohann Dudouit typedef struct {
47*f1a13f77SYohann Dudouit   CeedInt tidx;
48*f1a13f77SYohann Dudouit   CeedInt tidy;
49*f1a13f77SYohann Dudouit   CeedInt tidz;
50*f1a13f77SYohann Dudouit   CeedInt tid;
51*f1a13f77SYohann Dudouit   CeedScalar* slice;
52*f1a13f77SYohann Dudouit } BackendData;
53241a4b83SYohann 
54241a4b83SYohann template <int P, int Q>
55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
56241a4b83SYohann   for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
57241a4b83SYohann     B[i] = d_B[i];
58241a4b83SYohann   }
59241a4b83SYohann   __syncthreads;
60241a4b83SYohann }
61241a4b83SYohann 
62241a4b83SYohann //****
63241a4b83SYohann // 1D
64241a4b83SYohann template <int NCOMP, int P1d>
65241a4b83SYohann inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
66241a4b83SYohann   if (data.tidx<P1d)
67241a4b83SYohann   {
68241a4b83SYohann     const CeedInt dof = data.tidx;
69241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
70241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
71241a4b83SYohann       r_u[comp] = d_u[ind + ndofs * comp];
72241a4b83SYohann     }
73241a4b83SYohann   }
74241a4b83SYohann }
75241a4b83SYohann 
76241a4b83SYohann template <int NCOMP, int P1d>
77241a4b83SYohann inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78241a4b83SYohann   if (data.tidx<P1d)
79241a4b83SYohann   {
80241a4b83SYohann     const CeedInt dof = data.tidx;
81241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
82241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
83241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
84241a4b83SYohann     }
85241a4b83SYohann   }
86241a4b83SYohann }
87241a4b83SYohann 
88241a4b83SYohann template <int NCOMP, int Q1d>
89241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
90241a4b83SYohann   const CeedInt dof = data.tidx;
91241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
92241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
93241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
94241a4b83SYohann   }
95241a4b83SYohann }
96241a4b83SYohann 
97241a4b83SYohann template <int NCOMP, int Q1d>
98241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
99241a4b83SYohann   const CeedInt dof = data.tidx;
100241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
101241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
102241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
103241a4b83SYohann   }
104241a4b83SYohann }
105241a4b83SYohann 
106241a4b83SYohann template <int NCOMP, int P1d>
107241a4b83SYohann inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
108241a4b83SYohann   if (data.tidx<P1d)
109241a4b83SYohann   {
110241a4b83SYohann     const CeedInt dof = data.tidx;
111241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
112241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
113241a4b83SYohann       atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
114241a4b83SYohann     }
115241a4b83SYohann   }
116241a4b83SYohann }
117241a4b83SYohann 
118241a4b83SYohann template <int NCOMP, int P1d>
119241a4b83SYohann inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
120241a4b83SYohann   if (data.tidx<P1d)
121241a4b83SYohann   {
122241a4b83SYohann     const CeedInt dof = data.tidx;
123241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d;
124241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
125241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
126241a4b83SYohann     }
127241a4b83SYohann   }
128241a4b83SYohann }
129241a4b83SYohann 
130241a4b83SYohann template <int NCOMP, int Q1d>
131241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
132241a4b83SYohann   const CeedInt dof = data.tidx;
133241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
134241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
135241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
136241a4b83SYohann   }
137241a4b83SYohann }
138241a4b83SYohann 
139241a4b83SYohann template <int NCOMP, int Q1d>
140241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
141241a4b83SYohann   const CeedInt dof = data.tidx;
142241a4b83SYohann   const CeedInt ind = dof + elem * Q1d;
143241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
144241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
145241a4b83SYohann   }
146241a4b83SYohann }
147241a4b83SYohann 
148241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
149241a4b83SYohann inline __device__ void ContractX1d(BackendData& data,
150241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
151241a4b83SYohann   data.slice[data.tidx] = *U;
152241a4b83SYohann   __syncthreads();
153241a4b83SYohann   *V = 0.0;
154241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
155241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
156241a4b83SYohann   }
157241a4b83SYohann   __syncthreads();
158241a4b83SYohann }
159241a4b83SYohann 
160241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
161241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data,
162241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
163241a4b83SYohann   data.slice[data.tidx] = *U;
164241a4b83SYohann   __syncthreads();
165241a4b83SYohann   *V = 0.0;
166241a4b83SYohann   for (int i = 0; i < Q1d; ++i) {
167241a4b83SYohann     *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
168241a4b83SYohann   }
169241a4b83SYohann   __syncthreads();
170241a4b83SYohann }
171241a4b83SYohann 
172241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
173241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
174241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
175241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
176241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
177241a4b83SYohann   }
178241a4b83SYohann }
179241a4b83SYohann 
180241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
181241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
182241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
183241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
184241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
185241a4b83SYohann   }
186241a4b83SYohann }
187241a4b83SYohann 
188241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
189241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
190241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
191241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
192241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
193241a4b83SYohann     ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
194241a4b83SYohann   }
195241a4b83SYohann }
196241a4b83SYohann 
197241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
198241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
199241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
200241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
201241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
202241a4b83SYohann     ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
203241a4b83SYohann   }
204241a4b83SYohann }
205241a4b83SYohann 
206241a4b83SYohann //****
207241a4b83SYohann // 2D
208241a4b83SYohann template <int NCOMP, int P1d>
209241a4b83SYohann inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
210241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
211241a4b83SYohann   {
212241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
213241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
214241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
215241a4b83SYohann       r_u[comp] = d_u[ind + ndofs * comp];
216241a4b83SYohann     }
217241a4b83SYohann   }
218241a4b83SYohann }
219241a4b83SYohann 
220241a4b83SYohann template <int NCOMP, int P1d>
221241a4b83SYohann inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
222241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
223241a4b83SYohann   {
224241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
225241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
226241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
227241a4b83SYohann       r_u[comp] = d_u[ind * NCOMP + comp];
228241a4b83SYohann     }
229241a4b83SYohann   }
230241a4b83SYohann }
231241a4b83SYohann 
232241a4b83SYohann template <int NCOMP, int Q1d>
233241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
234241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
235241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
236241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
237241a4b83SYohann     r_u[comp] = d_u[ind + nquads * comp];
238241a4b83SYohann   }
239241a4b83SYohann }
240241a4b83SYohann 
241241a4b83SYohann template <int NCOMP, int Q1d>
242241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
243241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
244241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
245241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
246241a4b83SYohann     r_u[comp] = d_u[ind * NCOMP + comp];
247241a4b83SYohann   }
248241a4b83SYohann }
249241a4b83SYohann 
250241a4b83SYohann template <int NCOMP, int P1d>
251241a4b83SYohann inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
252241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
253241a4b83SYohann   {
254241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
255241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
256241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
257241a4b83SYohann       atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]);
258241a4b83SYohann     }
259241a4b83SYohann   }
260241a4b83SYohann }
261241a4b83SYohann 
262241a4b83SYohann template <int NCOMP, int P1d>
263241a4b83SYohann inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
264241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d)
265241a4b83SYohann   {
266241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*P1d;
267241a4b83SYohann     const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d;
268241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
269241a4b83SYohann       atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
270241a4b83SYohann     }
271241a4b83SYohann   }
272241a4b83SYohann }
273241a4b83SYohann 
274241a4b83SYohann template <int NCOMP, int Q1d>
275241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
276241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
277241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
278241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
279241a4b83SYohann     d_v[ind + nquads * comp] = r_v[comp];
280241a4b83SYohann   }
281241a4b83SYohann }
282241a4b83SYohann 
283241a4b83SYohann template <int NCOMP, int Q1d>
284241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
285241a4b83SYohann   const CeedInt dof = data.tidx + data.tidy*Q1d;
286241a4b83SYohann   const CeedInt ind = dof + elem * Q1d*Q1d;
287241a4b83SYohann   for(CeedInt comp = 0; comp < NCOMP; ++comp) {
288241a4b83SYohann     d_v[ind * NCOMP + comp] = r_v[comp];
289241a4b83SYohann   }
290241a4b83SYohann }
291241a4b83SYohann 
292241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
293241a4b83SYohann inline __device__ void ContractX2d(BackendData& data,
294241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
295241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
296241a4b83SYohann   __syncthreads();
297241a4b83SYohann   *V = 0.0;
298241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
299241a4b83SYohann     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
300241a4b83SYohann   }
301241a4b83SYohann   __syncthreads();
302241a4b83SYohann }
303241a4b83SYohann 
304241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
305241a4b83SYohann inline __device__ void ContractY2d(BackendData& data,
306241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
307241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
308241a4b83SYohann   __syncthreads();
309241a4b83SYohann   *V = 0.0;
310241a4b83SYohann   for (int i = 0; i < P1d; ++i) {
311241a4b83SYohann     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
312241a4b83SYohann   }
313241a4b83SYohann   __syncthreads();
314241a4b83SYohann }
315241a4b83SYohann 
316241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
317241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data,
318241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
319241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
320241a4b83SYohann   __syncthreads();
321241a4b83SYohann   *V = 0.0;
322241a4b83SYohann   if (data.tidy<P1d) {
323241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
324241a4b83SYohann       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
325241a4b83SYohann     }
326241a4b83SYohann   }
327241a4b83SYohann   __syncthreads();
328241a4b83SYohann }
329241a4b83SYohann 
330241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
331241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data,
332241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
333241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
334241a4b83SYohann   __syncthreads();
335241a4b83SYohann   *V = 0.0;
336241a4b83SYohann   if (data.tidx<P1d) {
337241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
338241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
339241a4b83SYohann     }
340241a4b83SYohann   }
341241a4b83SYohann   __syncthreads();
342241a4b83SYohann }
343241a4b83SYohann 
344241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
345241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data,
346241a4b83SYohann                                             const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
347241a4b83SYohann   data.slice[data.tidx+data.tidy*Q1d] = *U;
348241a4b83SYohann   __syncthreads();
349241a4b83SYohann   if (data.tidx<P1d) {
350241a4b83SYohann     for (int i = 0; i < Q1d; ++i) {
351241a4b83SYohann       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
352241a4b83SYohann     }
353241a4b83SYohann   }
354241a4b83SYohann   __syncthreads();
355241a4b83SYohann }
356241a4b83SYohann 
357241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
358241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
359241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
360241a4b83SYohann   CeedScalar r_t[1];
361241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
362241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
363241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
364241a4b83SYohann   }
365241a4b83SYohann }
366241a4b83SYohann 
367241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
368241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
369241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
370241a4b83SYohann   CeedScalar r_t[1];
371241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
372241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
373241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
374241a4b83SYohann   }
375241a4b83SYohann }
376241a4b83SYohann 
377241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
378241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
379241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
380241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
381241a4b83SYohann   CeedScalar r_t[1];
382241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
383241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
384241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
385241a4b83SYohann     ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
386241a4b83SYohann     ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
387241a4b83SYohann   }
388241a4b83SYohann }
389241a4b83SYohann 
390241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
391241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
392241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
393241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
394241a4b83SYohann   CeedScalar r_t[1];
395241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
396241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
397241a4b83SYohann     ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
398241a4b83SYohann     ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
399241a4b83SYohann     ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
400241a4b83SYohann   }
401241a4b83SYohann }
402241a4b83SYohann 
403241a4b83SYohann //****
404241a4b83SYohann // 3D
405241a4b83SYohann template <int NCOMP, int P1d>
406241a4b83SYohann inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
407241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
408241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
409241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
410241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
411241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
412241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind + ndofs * comp];
413241a4b83SYohann       }
414241a4b83SYohann     }
415241a4b83SYohann   }
416241a4b83SYohann }
417241a4b83SYohann 
418241a4b83SYohann template <int NCOMP, int P1d>
419241a4b83SYohann inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
420241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
421241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
422241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
423241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
424241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
425241a4b83SYohann         r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
426241a4b83SYohann       }
427241a4b83SYohann     }
428241a4b83SYohann   }
429241a4b83SYohann }
430241a4b83SYohann 
431241a4b83SYohann template <int NCOMP, int Q1d>
432241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
433241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
434241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
435241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
436241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
437241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
438241a4b83SYohann     }
439241a4b83SYohann   }
440241a4b83SYohann }
441241a4b83SYohann 
442241a4b83SYohann template <int NCOMP, int Q1d>
443241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
444241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
445241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
446241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
447241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
448241a4b83SYohann       r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
449241a4b83SYohann     }
450241a4b83SYohann   }
451241a4b83SYohann }
452241a4b83SYohann 
453241a4b83SYohann template <int NCOMP, int P1d>
454241a4b83SYohann inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
455241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
456241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
457241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
458241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
459241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
460241a4b83SYohann         atomicAdd(&d_v[ind + ndofs * comp], r_v[z+comp*P1d]);
461241a4b83SYohann       }
462241a4b83SYohann     }
463241a4b83SYohann   }
464241a4b83SYohann }
465241a4b83SYohann 
466241a4b83SYohann template <int NCOMP, int P1d>
467241a4b83SYohann inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
468241a4b83SYohann   if (data.tidx<P1d && data.tidy<P1d) {
469241a4b83SYohann     for (CeedInt z = 0; z < P1d; ++z) {
470241a4b83SYohann       const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d;
471241a4b83SYohann       const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d;
472241a4b83SYohann       for(CeedInt comp = 0; comp < NCOMP; ++comp) {
473241a4b83SYohann         atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
474241a4b83SYohann       }
475241a4b83SYohann     }
476241a4b83SYohann   }
477241a4b83SYohann }
478241a4b83SYohann 
479241a4b83SYohann template <int NCOMP, int Q1d>
480241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
481241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
482241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
483241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
484241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
485241a4b83SYohann       d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
486241a4b83SYohann     }
487241a4b83SYohann   }
488241a4b83SYohann }
489241a4b83SYohann 
490241a4b83SYohann template <int NCOMP, int Q1d>
491241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
492241a4b83SYohann   for(CeedInt z=0; z < Q1d; ++z) {
493241a4b83SYohann     const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
494241a4b83SYohann     const CeedInt ind = dof + elem * Q1d*Q1d*Q1d;
495241a4b83SYohann     for(CeedInt comp = 0; comp < NCOMP; ++comp) {
496241a4b83SYohann       d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
497241a4b83SYohann     }
498241a4b83SYohann   }
499241a4b83SYohann }
500241a4b83SYohann 
501241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
502241a4b83SYohann inline __device__ void ContractX3d(BackendData& data,
503241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
504241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
505241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
506241a4b83SYohann     __syncthreads();
507241a4b83SYohann     V[k] = 0.0;
508241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
509241a4b83SYohann       V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
510241a4b83SYohann     }
511241a4b83SYohann     __syncthreads();
512241a4b83SYohann   }
513241a4b83SYohann }
514241a4b83SYohann 
515241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
516241a4b83SYohann inline __device__ void ContractY3d(BackendData& data,
517241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
518241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
519241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
520241a4b83SYohann     __syncthreads();
521241a4b83SYohann     V[k] = 0.0;
522241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
523241a4b83SYohann       V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
524241a4b83SYohann     }
525241a4b83SYohann     __syncthreads();
526241a4b83SYohann   }
527241a4b83SYohann }
528241a4b83SYohann 
529241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
530241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data,
531241a4b83SYohann                                    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
532241a4b83SYohann   for (int k = 0; k < Q1d; ++k) {
533241a4b83SYohann     V[k] = 0.0;
534241a4b83SYohann     for (int i = 0; i < P1d; ++i) {
535241a4b83SYohann       V[k] += B[i + k*P1d] * U[i];//contract z direction
536241a4b83SYohann     }
537241a4b83SYohann   }
538241a4b83SYohann }
539241a4b83SYohann 
540241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
541241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data,
542241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
543241a4b83SYohann   for (int k = 0; k < Q1d; ++k) {
544241a4b83SYohann     V[k] = 0.0;
545241a4b83SYohann     if (k<P1d) {
546241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
547241a4b83SYohann         V[k] += B[k + i*P1d] * U[i];//contract z direction
548241a4b83SYohann       }
549241a4b83SYohann     }
550241a4b83SYohann   }
551241a4b83SYohann }
552241a4b83SYohann 
553241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
554241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data,
555241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
556241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
557241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
558241a4b83SYohann     __syncthreads();
559241a4b83SYohann     V[k] = 0.0;
560241a4b83SYohann     if (data.tidy<P1d) {
561241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
562241a4b83SYohann         V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
563241a4b83SYohann       }
564241a4b83SYohann     }
565241a4b83SYohann     __syncthreads();
566241a4b83SYohann   }
567241a4b83SYohann }
568241a4b83SYohann 
569241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
570241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data,
571241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
572241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
573241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
574241a4b83SYohann     __syncthreads();
575241a4b83SYohann     V[k] = 0.0;
576241a4b83SYohann     if (data.tidx<P1d) {
577241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
578241a4b83SYohann         V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
579241a4b83SYohann       }
580241a4b83SYohann     }
581241a4b83SYohann     __syncthreads();
582241a4b83SYohann   }
583241a4b83SYohann }
584241a4b83SYohann 
585241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
586241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data,
587241a4b83SYohann     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
588241a4b83SYohann   for (int k = 0; k < P1d; ++k) {
589241a4b83SYohann     data.slice[data.tidx+data.tidy*Q1d] = U[k];
590241a4b83SYohann     __syncthreads();
591241a4b83SYohann     if (data.tidx<P1d) {
592241a4b83SYohann       for (int i = 0; i < Q1d; ++i) {
593241a4b83SYohann         V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
594241a4b83SYohann       }
595241a4b83SYohann     }
596241a4b83SYohann     __syncthreads();
597241a4b83SYohann   }
598241a4b83SYohann }
599241a4b83SYohann 
600241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
601241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
602241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
603241a4b83SYohann   CeedScalar r_t1[Q1d];
604241a4b83SYohann   CeedScalar r_t2[Q1d];
605241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
606241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
607241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
608241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
609241a4b83SYohann   }
610241a4b83SYohann }
611241a4b83SYohann 
612241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
613241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
614241a4b83SYohann                                 CeedScalar *__restrict__ r_V) {
615241a4b83SYohann   CeedScalar r_t1[Q1d];
616241a4b83SYohann   CeedScalar r_t2[Q1d];
617241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
618241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
619241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
620241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
621241a4b83SYohann   }
622241a4b83SYohann }
623241a4b83SYohann 
624241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
625241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
626241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
627241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
628241a4b83SYohann   CeedScalar r_t1[Q1d];
629241a4b83SYohann   CeedScalar r_t2[Q1d];
630241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
631241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
632241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
633241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
634241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
635241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
636241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
637241a4b83SYohann     ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
638241a4b83SYohann     ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
639241a4b83SYohann     ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
640241a4b83SYohann   }
641241a4b83SYohann }
642241a4b83SYohann 
643241a4b83SYohann template <int NCOMP, int P1d, int Q1d>
644241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
645241a4b83SYohann                               const CeedScalar *c_B, const CeedScalar *c_G,
646241a4b83SYohann                               CeedScalar *__restrict__ r_V) {
647241a4b83SYohann   CeedScalar r_t1[Q1d];
648241a4b83SYohann   CeedScalar r_t2[Q1d];
649241a4b83SYohann   for(int comp=0; comp<NCOMP; comp++) {
650241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
651241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
652241a4b83SYohann     ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
653241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
654241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
655241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
656241a4b83SYohann     ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
657241a4b83SYohann     ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
658241a4b83SYohann     ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
659241a4b83SYohann   }
660241a4b83SYohann }
661241a4b83SYohann 
662241a4b83SYohann template <int Q1d>
663241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
664241a4b83SYohann   *w = qweight1d[data.tidx];
665241a4b83SYohann }
666241a4b83SYohann 
667241a4b83SYohann template <int Q1d>
668241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
669241a4b83SYohann   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
670241a4b83SYohann }
671241a4b83SYohann 
672241a4b83SYohann template <int Q1d>
673241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
674241a4b83SYohann   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
675241a4b83SYohann   for (int z = 0; z < Q1d; ++z)
676241a4b83SYohann   {
677241a4b83SYohann     w[z] = pw*qweight1d[z];
678241a4b83SYohann   }
679241a4b83SYohann }
680241a4b83SYohann 
681241a4b83SYohann );
682241a4b83SYohann 
683241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
684241a4b83SYohann 
685241a4b83SYohann 	using std::ostringstream;
686241a4b83SYohann   using std::string;
687241a4b83SYohann   int ierr;
688241a4b83SYohann   bool setupdone;
689241a4b83SYohann   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
690241a4b83SYohann   if (setupdone) return 0;
691241a4b83SYohann   Ceed ceed;
692241a4b83SYohann   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
693241a4b83SYohann   CeedOperator_Cuda_gen *data;
694241a4b83SYohann   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
695241a4b83SYohann   CeedQFunction qf;
696241a4b83SYohann   CeedQFunction_Cuda_gen *qf_data;
697241a4b83SYohann   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
698241a4b83SYohann   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
699241a4b83SYohann   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof;
700241a4b83SYohann   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
701241a4b83SYohann   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
702241a4b83SYohann   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
703241a4b83SYohann   CeedChk(ierr);
704241a4b83SYohann   CeedOperatorField *opinputfields, *opoutputfields;
705241a4b83SYohann   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
706241a4b83SYohann   CeedChk(ierr);
707241a4b83SYohann   CeedQFunctionField *qfinputfields, *qfoutputfields;
708241a4b83SYohann   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
709241a4b83SYohann   CeedChk(ierr);
710241a4b83SYohann   CeedEvalMode emode;
711241a4b83SYohann   CeedTransposeMode lmode;
712241a4b83SYohann   CeedBasis basis;
713241a4b83SYohann   CeedBasis_Cuda_shared *basis_data;
714241a4b83SYohann   CeedElemRestriction Erestrict;
715241a4b83SYohann   CeedElemRestriction_Cuda_reg *restr_data;
716241a4b83SYohann 
717241a4b83SYohann   ostringstream code;
718241a4b83SYohann   string devFunctions(deviceFunctions);
719241a4b83SYohann 
720*f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
721*f1a13f77SYohann Dudouit   struct cudaDeviceProp prop;
722*f1a13f77SYohann Dudouit   Ceed delegate;
723*f1a13f77SYohann Dudouit   CeedGetDelegate(ceed, &delegate);
724*f1a13f77SYohann Dudouit   Ceed_Cuda *ceed_data;
725*f1a13f77SYohann Dudouit   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
726*f1a13f77SYohann Dudouit   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
727*f1a13f77SYohann Dudouit   if(prop.major<6){
728*f1a13f77SYohann Dudouit     std::cout<< "I'm an old GPU" << std::endl;
729*f1a13f77SYohann Dudouit     code << atomicAdd;
730*f1a13f77SYohann Dudouit   }
731*f1a13f77SYohann Dudouit 
732241a4b83SYohann   code << devFunctions;
733241a4b83SYohann 
734241a4b83SYohann   string qFunction(qf_data->qFunctionSource);
735241a4b83SYohann   code << qFunction;
736241a4b83SYohann 
737241a4b83SYohann   // Setup
738241a4b83SYohann   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
739241a4b83SYohann   // Input Evecs and Restriction
740241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
741241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
742241a4b83SYohann     CeedChk(ierr);
743241a4b83SYohann     if (emode == CEED_EVAL_WEIGHT) { // Skip
744241a4b83SYohann     } else {
745241a4b83SYohann       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
746241a4b83SYohann       if (emode != CEED_EVAL_NONE)
747241a4b83SYohann       {
748241a4b83SYohann         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
749241a4b83SYohann         bool isTensor;
750241a4b83SYohann         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
751241a4b83SYohann         //TODO check that all are the same
752241a4b83SYohann         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
753241a4b83SYohann         if (isTensor)
754241a4b83SYohann         {
755241a4b83SYohann           //TODO check that all are the same
756241a4b83SYohann           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
757241a4b83SYohann         } else {
758241a4b83SYohann           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
759241a4b83SYohann         }
760241a4b83SYohann       }
761241a4b83SYohann     }
762241a4b83SYohann   }
763241a4b83SYohann   data->dim = dim;
764241a4b83SYohann   data->Q1d = Q1d;
765241a4b83SYohann 
766241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
767241a4b83SYohann     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
768241a4b83SYohann   }
769241a4b83SYohann   code << "const CeedInt Dim = "<<dim<<";\n";
770241a4b83SYohann   code << "const CeedInt Q1d = "<<Q1d<<";\n";
771241a4b83SYohann   // code << "const CeedInt Q   = "<<Q<<";\n";
772241a4b83SYohann   code << "extern __shared__ CeedScalar slice[];\n";
773241a4b83SYohann   code << "BackendData data;\n";
774241a4b83SYohann   code << "data.tidx = threadIdx.x;\n";
775241a4b83SYohann   code << "data.tidy = threadIdx.y;\n";
776241a4b83SYohann   code << "data.tidz = threadIdx.z;\n";
777241a4b83SYohann   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
778241a4b83SYohann   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
779241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
780241a4b83SYohann     code << "// Input field "<<i<<"\n";
781241a4b83SYohann     // Get elemsize, emode, ncomp
782241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
783241a4b83SYohann     CeedChk(ierr);
784241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
785241a4b83SYohann     CeedChk(ierr);
786241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
787241a4b83SYohann     CeedChk(ierr);
788241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
789241a4b83SYohann     CeedChk(ierr);
790241a4b83SYohann     // Basis action
791241a4b83SYohann     switch (emode) {
792241a4b83SYohann     case CEED_EVAL_NONE:
793241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
794241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
795241a4b83SYohann       code << "  const CeedInt nquads_in_"<<i<<" = "<<ndof/ncomp<<";\n";
796241a4b83SYohann       break;
797241a4b83SYohann     case CEED_EVAL_INTERP:
798241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
799241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
800241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
801241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
802241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
803241a4b83SYohann       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
804241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
805241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
806241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
807241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
808241a4b83SYohann       break;
809241a4b83SYohann     case CEED_EVAL_GRAD:
810241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
811241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
812241a4b83SYohann       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
813241a4b83SYohann       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
814241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
815241a4b83SYohann       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
816241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
817241a4b83SYohann       data->B.in[i] = basis_data->d_interp1d;
818241a4b83SYohann       data->G.in[i] = basis_data->d_grad1d;
819241a4b83SYohann       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
820241a4b83SYohann       code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
821241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
822241a4b83SYohann       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
823241a4b83SYohann       break;
824241a4b83SYohann     case CEED_EVAL_WEIGHT:
825241a4b83SYohann       break; // No action
826241a4b83SYohann     case CEED_EVAL_DIV:
827241a4b83SYohann       break; // TODO: Not implemented
828241a4b83SYohann     case CEED_EVAL_CURL:
829241a4b83SYohann       break; // TODO: Not implemented
830241a4b83SYohann     }
831241a4b83SYohann   }
832241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
833241a4b83SYohann     code << "// Output field "<<i<<"\n";
834241a4b83SYohann     // Get elemsize, emode, ncomp
835241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
836241a4b83SYohann     CeedChk(ierr);
837241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
838241a4b83SYohann     CeedChk(ierr);
839241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
840241a4b83SYohann     CeedChk(ierr);
841241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
842241a4b83SYohann     CeedChk(ierr);
843241a4b83SYohann     // Basis action
844241a4b83SYohann     switch (emode) {
845241a4b83SYohann     case CEED_EVAL_NONE:
846241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
847241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
848241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
849241a4b83SYohann       code << "  const CeedInt nquads_out_"<<i<<" = "<<ndof<<"/ncomp_out_"<<i<<";\n";
850241a4b83SYohann       break; // No action
851241a4b83SYohann     case CEED_EVAL_INTERP:
852241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
853241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
854241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
855241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
856241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
857241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
858241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
859241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
860241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
861241a4b83SYohann       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
862241a4b83SYohann       break;
863241a4b83SYohann     case CEED_EVAL_GRAD:
864241a4b83SYohann       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
865241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
866241a4b83SYohann       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
867241a4b83SYohann       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
868241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
869241a4b83SYohann       data->B.out[i] = basis_data->d_interp1d;
870241a4b83SYohann       data->G.out[i] = basis_data->d_grad1d;
871241a4b83SYohann       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
872241a4b83SYohann       code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
873241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
874241a4b83SYohann       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
875241a4b83SYohann       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
876241a4b83SYohann       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
877241a4b83SYohann       break;
878241a4b83SYohann     case CEED_EVAL_WEIGHT: {
879241a4b83SYohann       Ceed ceed;
880241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
881241a4b83SYohann       return CeedError(ceed, 1,
882241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
883241a4b83SYohann       break; // Should not occur
884241a4b83SYohann     }
885241a4b83SYohann     case CEED_EVAL_DIV:
886241a4b83SYohann       break; // TODO: Not implemented
887241a4b83SYohann     case CEED_EVAL_CURL:
888241a4b83SYohann       break; // TODO: Not implemented
889241a4b83SYohann     }
890241a4b83SYohann   }
891241a4b83SYohann   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
892241a4b83SYohann   // Input basis apply if needed
893241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
894241a4b83SYohann     code << "// Input field "<<i<<"\n";
895241a4b83SYohann     // Get elemsize, emode, ncomp
896241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
897241a4b83SYohann     CeedChk(ierr);
898241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
899241a4b83SYohann     CeedChk(ierr);
900241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
901241a4b83SYohann     CeedChk(ierr);
902241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
903241a4b83SYohann     CeedChk(ierr);
904241a4b83SYohann     // Basis action
905241a4b83SYohann     switch (emode) {
906241a4b83SYohann     case CEED_EVAL_NONE:
907241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
908241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
909241a4b83SYohann       code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
910241a4b83SYohann       break;
911241a4b83SYohann     case CEED_EVAL_INTERP:
912241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
913241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
914241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
915241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
916241a4b83SYohann       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
917241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
918241a4b83SYohann       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
919241a4b83SYohann       break;
920241a4b83SYohann     case CEED_EVAL_GRAD:
921241a4b83SYohann       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
922241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
923241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
924241a4b83SYohann       data->indices.in[i] = restr_data->d_ind;
925241a4b83SYohann       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
926241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
927241a4b83SYohann       code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
928241a4b83SYohann       break;
929241a4b83SYohann     case CEED_EVAL_WEIGHT:
930241a4b83SYohann       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
931241a4b83SYohann       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
932241a4b83SYohann       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
933241a4b83SYohann       data->W = basis_data->d_qweight1d;
934241a4b83SYohann       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
935241a4b83SYohann       break; // No action
936241a4b83SYohann     case CEED_EVAL_DIV:
937241a4b83SYohann       break; // TODO: Not implemented
938241a4b83SYohann     case CEED_EVAL_CURL:
939241a4b83SYohann       break; // TODO: Not implemented
940241a4b83SYohann     }
941241a4b83SYohann   }
942241a4b83SYohann   // Q function
943241a4b83SYohann   code << "// QFunction\n";
944241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
945241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
946241a4b83SYohann     CeedChk(ierr);
947241a4b83SYohann     if (emode==CEED_EVAL_GRAD)
948241a4b83SYohann     {
949241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
950241a4b83SYohann     }
951241a4b83SYohann     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
952241a4b83SYohann     {
953241a4b83SYohann       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
954241a4b83SYohann     }
955241a4b83SYohann   }
956241a4b83SYohann   //TODO write qfunction load for this backend
957241a4b83SYohann   string qFunctionName(qf_data->qFunctionName);
958241a4b83SYohann   code << "  "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
959241a4b83SYohann   for (CeedInt i = 0; i < numinputfields; i++) {
960241a4b83SYohann     code << "r_t"<<i<<", ";
961241a4b83SYohann   }
962241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
963241a4b83SYohann     code << "r_tt"<<i;
964241a4b83SYohann     if (i<numoutputfields-1)
965241a4b83SYohann     {
966241a4b83SYohann       code << ", ";
967241a4b83SYohann     }
968241a4b83SYohann   }
969241a4b83SYohann   code << ");\n";
970241a4b83SYohann 
971241a4b83SYohann   // Output basis apply if needed
972241a4b83SYohann   for (CeedInt i = 0; i < numoutputfields; i++) {
973241a4b83SYohann     code << "// Output field "<<i<<"\n";
974241a4b83SYohann     // Get elemsize, emode, ncomp
975241a4b83SYohann     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
976241a4b83SYohann     CeedChk(ierr);
977241a4b83SYohann     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
978241a4b83SYohann     CeedChk(ierr);
979241a4b83SYohann     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
980241a4b83SYohann     CeedChk(ierr);
981241a4b83SYohann     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
982241a4b83SYohann     CeedChk(ierr);
983241a4b83SYohann     // Basis action
984241a4b83SYohann     switch (emode) {
985241a4b83SYohann     case CEED_EVAL_NONE:
986241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
987241a4b83SYohann       code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
988241a4b83SYohann       break; // No action
989241a4b83SYohann     case CEED_EVAL_INTERP:
990241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
991241a4b83SYohann       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
992241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
993241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
994241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
995241a4b83SYohann       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
996241a4b83SYohann       break;
997241a4b83SYohann     case CEED_EVAL_GRAD:
998241a4b83SYohann       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
999241a4b83SYohann       code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1000241a4b83SYohann       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1001241a4b83SYohann       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1002241a4b83SYohann       data->indices.out[i] = restr_data->d_ind;
1003241a4b83SYohann       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1004241a4b83SYohann       break;
1005241a4b83SYohann     case CEED_EVAL_WEIGHT: {
1006241a4b83SYohann       Ceed ceed;
1007241a4b83SYohann       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1008241a4b83SYohann       return CeedError(ceed, 1,
1009241a4b83SYohann                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1010241a4b83SYohann       break; // Should not occur
1011241a4b83SYohann     }
1012241a4b83SYohann     case CEED_EVAL_DIV:
1013241a4b83SYohann       break; // TODO: Not implemented
1014241a4b83SYohann     case CEED_EVAL_CURL:
1015241a4b83SYohann       break; // TODO: Not implemented
1016241a4b83SYohann     }
1017241a4b83SYohann   }
1018241a4b83SYohann 
1019241a4b83SYohann   code << "  }\n";
1020241a4b83SYohann   code << "}\n\n";
1021241a4b83SYohann 
1022241a4b83SYohann   // std::cout << code.str();
1023241a4b83SYohann 
1024241a4b83SYohann   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1025241a4b83SYohann   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1026241a4b83SYohann   CeedChk(ierr);
1027241a4b83SYohann 
1028241a4b83SYohann   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1029241a4b83SYohann 
1030241a4b83SYohann   return 0;
1031241a4b83SYohann }
1032