xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision ea3392645c1709eccfcfcccd15f4375e42d1a700)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #include <ceed-backend.h>
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda/ceed-cuda.h"
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
24 static const char *deviceFunctions = QUOTE(
25 
// Pointers to the (at most 16) input/output field arrays handed to the
// generated kernel.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
// Element-restriction index arrays for the same fields.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread identifiers plus this thread's window into shared memory,
// bundled so the device helpers below take a single argument.
typedef struct {
  CeedInt tidx;       // threadIdx.x
  CeedInt tidy;       // threadIdx.y
  CeedInt tidz;       // threadIdx.z
  CeedInt tid;        // flattened thread id within the block
  CeedScalar* slice;  // this tidz-layer's window into dynamic shared memory
} BackendData;
36 
#if __CUDA_ARCH__ < 600
// Software double-precision atomicAdd via a compare-and-swap loop, for
// architectures (< SM60) that lack the native double atomicAdd
// instruction.  This is the canonical pattern from the CUDA C
// Programming Guide; returns the value at *address before the add.
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif // __CUDA_ARCH__ < 600
53 
// Cooperatively copy a P x Q basis matrix from global memory (d_B) into
// the destination buffer B (a __shared__ array at the call sites),
// striding the copy across every thread in the block.
// Fix: the original ended with `__syncthreads;` (no parentheses), which
// only names the function and never executes the barrier — so threads
// could read B before other threads finished writing it.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
  __syncthreads();
}
61 
62 //****
63 // 1D
// 1D gather of one element's nodal values into registers; components are
// strided by ndofs in d_u. Threads with tidx >= P1d do nothing.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  if (node < P1d) {
    const CeedInt offset = node + elem*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[ind + c*ndofs];
  }
}
75 
// 1D gather, interleaved layout: component c of node ind lives at
// d_u[ind*NCOMP + c].
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  if (node < P1d) {
    const CeedInt offset = node + elem*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[ind*NCOMP + c];
  }
}
87 
// 1D gather of quadrature-point values (no indirection); components are
// strided by nquads in d_u.
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt ind = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[ind + c*nquads];
}
96 
// 1D gather of quadrature-point values, interleaved layout
// (d_u[ind*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt ind = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[ind*NCOMP + c];
}
105 
// 1D scatter-add of one element's nodal values (components strided by
// ndofs). atomicAdd guards concurrent contributions to a shared dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  if (node < P1d) {
    const CeedInt offset = node + elem*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[ind + c*ndofs], r_v[c]);
  }
}
117 
// 1D scatter-add, interleaved layout (d_v[ind*NCOMP + c]); atomic for
// concurrent contributions to a shared dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  if (node < P1d) {
    const CeedInt offset = node + elem*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[ind*NCOMP + c], r_v[c]);
  }
}
129 
// 1D store of quadrature-point values (quads are element-private, so a
// plain store suffices); components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt ind = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[ind + c*nquads] = r_v[c];
}
138 
// 1D store of quadrature-point values, interleaved layout
// (d_v[ind*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt ind = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[ind*NCOMP + c] = r_v[c];
}
147 
// 1D contraction: each thread publishes its value to the shared slice,
// then computes V(tidx) = sum_i B[i + tidx*P1d] * slice[i].  Both
// barriers matter: the first orders the slice write before any read, the
// second protects the slice before the caller reuses it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}
159 
// Transposed 1D contraction (quads -> nodes): V(tidx) =
// sum_i B[tidx + i*P1d] * slice[i], summing over the Q1d quadrature
// points.  Barriers protect the shared slice as in ContractX1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}
171 
// 1D interpolation: apply the interpolation matrix c_B to each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; c++)
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U + c, c_B, r_V + c);
}
179 
// Transposed 1D interpolation (quads -> nodes), per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; c++)
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U + c, c_B, r_V + c);
}
187 
// 1D gradient: contract each component with the derivative matrix c_G.
// c_B is accepted for signature symmetry with grad2d/grad3d but is
// unused in 1D.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; c++)
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U + c, c_G, r_V + c);
}
196 
// Transposed 1D gradient: contract each component with c_G transposed.
// c_B is unused in 1D (kept for signature symmetry).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; c++)
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U + c, c_G, r_V + c);
}
205 
206 //****
207 // 2D
// 2D gather of one element's nodal values; components strided by ndofs.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + P1d*data.tidy;
    const CeedInt offset = node + elem*P1d*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[ind + c*ndofs];
  }
}
219 
// 2D gather, interleaved layout (d_u[ind*NCOMP + c]).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + P1d*data.tidy;
    const CeedInt offset = node + elem*P1d*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[ind*NCOMP + c];
  }
}
231 
// 2D gather of quadrature-point values (no indirection); components
// strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt ind = data.tidx + Q1d*data.tidy + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[ind + c*nquads];
}
240 
// 2D gather of quadrature-point values, interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt ind = data.tidx + Q1d*data.tidy + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[ind*NCOMP + c];
}
249 
// 2D scatter-add of one element's nodal values (components strided by
// ndofs); atomicAdd guards concurrent contributions to a shared dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + P1d*data.tidy;
    const CeedInt offset = node + elem*P1d*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[ind + c*ndofs], r_v[c]);
  }
}
261 
// 2D scatter-add, interleaved layout (d_v[ind*NCOMP + c]).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + P1d*data.tidy;
    const CeedInt offset = node + elem*P1d*P1d;
    const CeedInt ind = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[ind*NCOMP + c], r_v[c]);
  }
}
273 
// 2D store of quadrature-point values (element-private, plain store);
// components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt ind = data.tidx + Q1d*data.tidy + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[ind + c*nquads] = r_v[c];
}
282 
// 2D store of quadrature-point values, interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt ind = data.tidx + Q1d*data.tidy + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[ind*NCOMP + c] = r_v[c];
}
291 
// 2D x-direction contraction through the shared slice (Q1d-wide rows):
// V(tidx,tidy) = sum_i B[i + tidx*P1d] * slice(i,tidy).  Barriers order
// the slice write before reads and protect it before the caller reuses
// the slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}
303 
// 2D y-direction contraction through the shared slice:
// V(tidx,tidy) = sum_i B[i + tidy*P1d] * slice(tidx,i).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}
315 
// Transposed 2D y contraction (quads -> nodes): sums over Q1d quadrature
// points; only threads with tidy < P1d produce a sum, but *V is zeroed
// on all threads first.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}
329 
// Transposed 2D x contraction (quads -> nodes): only threads with
// tidx < P1d accumulate; *V is zeroed on all threads first.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}
343 
// Same as ContractXTranspose2d but ACCUMULATES into *V instead of
// overwriting it (no zeroing) — used to sum derivative contributions in
// gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}
356 
// 2D tensor-product interpolation: X then Y contraction per component,
// staged through the single-value register temporary r_t.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
366 
// Transposed 2D interpolation (quads -> nodes): Y-transpose then
// X-transpose contraction per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
376 
// 2D gradient at quadrature points.  Derivative d of component comp
// lands at r_V[comp + d*NCOMP] (d = 0: x, d = 1: y); the derivative
// direction contracts with c_G, the other direction with c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}
389 
// Transpose of grad2d: both derivative contributions are summed into
// r_V[comp].  The first pass overwrites (ContractXTranspose2d), the
// second accumulates (ContractXTransposeAdd2d) — order matters.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
402 
403 //****
404 // 3D
// 3D gather: each (tidx,tidy) thread loads its full z-line of nodal
// values into registers (r_u[z + c*P1d]); components strided by ndofs.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt offset = node + elem*P1d*P1d*P1d;
      const CeedInt ind = indices ? indices[offset] : offset;
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[ind + c*ndofs];
    }
  }
}
417 
// 3D gather, interleaved layout (d_u[ind*NCOMP + c]); registers hold one
// z-line per component.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt offset = node + elem*P1d*P1d*P1d;
      const CeedInt ind = indices ? indices[offset] : offset;
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[ind*NCOMP + c];
    }
  }
}
430 
// 3D gather of quadrature-point values: one z-line per thread, no
// indirection; components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt ind = data.tidx + Q1d*(data.tidy + Q1d*z) + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*Q1d] = d_u[ind + c*nquads];
  }
}
441 
// 3D gather of quadrature-point values, interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt ind = data.tidx + Q1d*(data.tidy + Q1d*z) + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*Q1d] = d_u[ind*NCOMP + c];
  }
}
452 
// 3D scatter-add of one element's nodal values (components strided by
// ndofs); atomicAdd guards concurrent contributions to a shared dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt offset = node + elem*P1d*P1d*P1d;
      const CeedInt ind = indices ? indices[offset] : offset;
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[ind + c*ndofs], r_v[z + c*P1d]);
    }
  }
}
465 
// 3D scatter-add, interleaved layout (d_v[ind*NCOMP + c]).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt offset = node + elem*P1d*P1d*P1d;
      const CeedInt ind = indices ? indices[offset] : offset;
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[ind*NCOMP + c], r_v[z + c*P1d]);
    }
  }
}
478 
// 3D store of quadrature-point values (element-private, plain store);
// components strided by nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt ind = data.tidx + Q1d*(data.tidy + Q1d*z) + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[ind + c*nquads] = r_v[z + c*Q1d];
  }
}
489 
// 3D store of quadrature-point values, interleaved layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt ind = data.tidx + Q1d*(data.tidy + Q1d*z) + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[ind*NCOMP + c] = r_v[z + c*Q1d];
  }
}
500 
// 3D x-direction contraction, applied layer by layer: each of the P1d
// z-layers held in registers is exchanged through the shared slice, with
// barriers around the slice read on every layer.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}
514 
// 3D y-direction contraction, layer by layer through the shared slice:
// V[k](tidx,tidy) = sum_i B[i + tidy*P1d] * slice(tidx,i).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}
528 
// 3D z-direction contraction entirely in registers: each thread owns its
// full z-line U[0..P1d), so no shared memory or synchronization is
// needed.  V[k] = sum_i B[i + k*P1d] * U[i].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}
539 
// Transposed 3D z contraction, register-only: all Q1d entries of V are
// zeroed, but only entries k < P1d accumulate a sum over the Q1d
// quadrature points.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}
552 
// Transposed 3D y contraction (quads -> nodes), layer by layer through
// the shared slice; only threads with tidy < P1d accumulate, V[k] is
// zeroed on all threads first.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}
568 
// Transposed 3D x contraction (quads -> nodes), layer by layer through
// the shared slice; only threads with tidx < P1d accumulate, V[k] is
// zeroed on all threads first.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}
584 
// Same as ContractTransposeX3d but ACCUMULATES into V (no zeroing) —
// used to sum derivative contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}
599 
// 3D tensor-product interpolation: X, Y, then Z contraction per
// component, staged through the register temporaries r_t1/r_t2.  Input
// components are strided by P1d in r_U, output by Q1d in r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}
611 
// Transposed 3D interpolation (quads -> nodes): Z, Y, then X transposed
// contractions per component.  Input components strided by Q1d, output
// by P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}
623 
// 3D gradient at quadrature points.  Derivative d of component comp
// lands at r_V[comp*Q1d + d*NCOMP*Q1d] (d = 0: x, 1: y, 2: z); for each
// d the corresponding direction contracts with c_G and the other two
// with c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}
642 
// Transpose of grad3d: the three derivative contributions are summed
// into r_V[comp*P1d].  The first direction overwrites via
// ContractTransposeX3d; the remaining two accumulate via
// ContractTransposeAddX3d — call order matters.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}
661 
// 1D quadrature weight for this thread's point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  w[0] = qweight1d[data.tidx];
}
666 
// 2D quadrature weight: tensor product of the two 1D weights.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  w[0] = qweight1d[data.tidx] * qweight1d[data.tidy];
}
671 
// 3D quadrature weights: xy product per thread times each z-line weight.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar wxy = qweight1d[data.tidx] * qweight1d[data.tidy];
  for (int z = 0; z < Q1d; z++)
    w[z] = wxy * qweight1d[z];
}
680 
681 );
682 
683 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
684 
685 	using std::ostringstream;
686   using std::string;
687   int ierr;
688   bool setupdone;
689   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
690   if (setupdone) return 0;
691   Ceed ceed;
692   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
693   CeedOperator_Cuda_gen *data;
694   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
695   CeedQFunction qf;
696   CeedQFunction_Cuda_gen *qf_data;
697   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
698   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
699   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof;
700   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
701   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
702   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
703   CeedChk(ierr);
704   CeedOperatorField *opinputfields, *opoutputfields;
705   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
706   CeedChk(ierr);
707   CeedQFunctionField *qfinputfields, *qfoutputfields;
708   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
709   CeedChk(ierr);
710   CeedEvalMode emode;
711   CeedTransposeMode lmode;
712   CeedBasis basis;
713   CeedBasis_Cuda_shared *basis_data;
714   CeedElemRestriction Erestrict;
715   CeedElemRestriction_Cuda_reg *restr_data;
716 
717   ostringstream code;
718   string devFunctions(deviceFunctions);
719 
720   code << devFunctions;
721 
722   string qFunction(qf_data->qFunctionSource);
723   code << qFunction;
724 
725   // Setup
726   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
727   // Input Evecs and Restriction
728   for (CeedInt i = 0; i < numinputfields; i++) {
729     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
730     CeedChk(ierr);
731     if (emode == CEED_EVAL_WEIGHT) { // Skip
732     } else {
733       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
734       if (emode != CEED_EVAL_NONE)
735       {
736         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
737         bool isTensor;
738         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
739         //TODO check that all are the same
740         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
741         if (isTensor)
742         {
743           //TODO check that all are the same
744           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
745         } else {
746           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
747         }
748       }
749     }
750   }
751   data->dim = dim;
752   data->Q1d = Q1d;
753 
754   for (CeedInt i = 0; i < numoutputfields; i++) {
755     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
756   }
757   code << "const CeedInt Dim = "<<dim<<";\n";
758   code << "const CeedInt Q1d = "<<Q1d<<";\n";
759   // code << "const CeedInt Q   = "<<Q<<";\n";
760   code << "extern __shared__ CeedScalar slice[];\n";
761   code << "BackendData data;\n";
762   code << "data.tidx = threadIdx.x;\n";
763   code << "data.tidy = threadIdx.y;\n";
764   code << "data.tidz = threadIdx.z;\n";
765   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
766   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
767   for (CeedInt i = 0; i < numinputfields; i++) {
768     code << "// Input field "<<i<<"\n";
769     // Get elemsize, emode, ncomp
770     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
771     CeedChk(ierr);
772     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
773     CeedChk(ierr);
774     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
775     CeedChk(ierr);
776     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
777     CeedChk(ierr);
778     // Basis action
779     switch (emode) {
780     case CEED_EVAL_NONE:
781       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
782       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
783       code << "  const CeedInt nquads_in_"<<i<<" = "<<ndof<<";\n";
784       break;
785     case CEED_EVAL_INTERP:
786       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
787       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
788       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
789       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
790       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
791       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
792       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
793       data->B.in[i] = basis_data->d_interp1d;
794       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
795       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
796       break;
797     case CEED_EVAL_GRAD:
798       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
799       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
800       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
801       code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
802       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
803       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
804       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
805       data->B.in[i] = basis_data->d_interp1d;
806       data->G.in[i] = basis_data->d_grad1d;
807       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
808       code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
809       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
810       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
811       break;
812     case CEED_EVAL_WEIGHT:
813       break; // No action
814     case CEED_EVAL_DIV:
815       break; // TODO: Not implemented
816     case CEED_EVAL_CURL:
817       break; // TODO: Not implemented
818     }
819   }
820   for (CeedInt i = 0; i < numoutputfields; i++) {
821     code << "// Output field "<<i<<"\n";
822     // Get elemsize, emode, ncomp
823     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
824     CeedChk(ierr);
825     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
826     CeedChk(ierr);
827     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
828     CeedChk(ierr);
829     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
830     CeedChk(ierr);
831     // Basis action
832     switch (emode) {
833     case CEED_EVAL_NONE:
834       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
835       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
836       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
837       code << "  const CeedInt nquads_out_"<<i<<" = "<<ndof<<";\n";
838       break; // No action
839     case CEED_EVAL_INTERP:
840       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
841       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
842       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
843       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
844       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
845       data->B.out[i] = basis_data->d_interp1d;
846       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
847       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
848       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
849       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
850       break;
851     case CEED_EVAL_GRAD:
852       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
853       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
854       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
855       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
856       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
857       data->B.out[i] = basis_data->d_interp1d;
858       data->G.out[i] = basis_data->d_grad1d;
859       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
860       code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
861       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
862       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
863       ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
864       code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
865       break;
866     case CEED_EVAL_WEIGHT: {
867       Ceed ceed;
868       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
869       return CeedError(ceed, 1,
870                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
871       break; // Should not occur
872     }
873     case CEED_EVAL_DIV:
874       break; // TODO: Not implemented
875     case CEED_EVAL_CURL:
876       break; // TODO: Not implemented
877     }
878   }
879   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
880   // Input basis apply if needed
881   for (CeedInt i = 0; i < numinputfields; i++) {
882     code << "// Input field "<<i<<"\n";
883     // Get elemsize, emode, ncomp
884     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
885     CeedChk(ierr);
886     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
887     CeedChk(ierr);
888     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
889     CeedChk(ierr);
890     ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
891     CeedChk(ierr);
892     // Basis action
893     switch (emode) {
894     case CEED_EVAL_NONE:
895       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
896       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
897       code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
898       break;
899     case CEED_EVAL_INTERP:
900       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
901       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
902       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
903       data->indices.in[i] = restr_data->d_ind;
904       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
905       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
906       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
907       break;
908     case CEED_EVAL_GRAD:
909       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
910       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
911       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
912       data->indices.in[i] = restr_data->d_ind;
913       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
914       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
915       code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
916       break;
917     case CEED_EVAL_WEIGHT:
918       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
919       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
920       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
921       data->W = basis_data->d_qweight1d;
922       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
923       break; // No action
924     case CEED_EVAL_DIV:
925       break; // TODO: Not implemented
926     case CEED_EVAL_CURL:
927       break; // TODO: Not implemented
928     }
929   }
930   // Q function
931   code << "// QFunction\n";
932   for (CeedInt i = 0; i < numoutputfields; i++) {
933     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
934     CeedChk(ierr);
935     if (emode==CEED_EVAL_GRAD)
936     {
937       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
938     }
939     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
940     {
941       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
942     }
943   }
944   //TODO write qfunction load for this backend
945   string qFunctionName(qf_data->qFunctionName);
946   code << "  "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
947   for (CeedInt i = 0; i < numinputfields; i++) {
948     code << "r_t"<<i<<", ";
949   }
950   for (CeedInt i = 0; i < numoutputfields; i++) {
951     code << "r_tt"<<i;
952     if (i<numoutputfields-1)
953     {
954       code << ", ";
955     }
956   }
957   code << ");\n";
958 
959   // Output basis apply if needed
960   for (CeedInt i = 0; i < numoutputfields; i++) {
961     code << "// Output field "<<i<<"\n";
962     // Get elemsize, emode, ncomp
963     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
964     CeedChk(ierr);
965     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
966     CeedChk(ierr);
967     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
968     CeedChk(ierr);
969     ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
970     CeedChk(ierr);
971     // Basis action
972     switch (emode) {
973     case CEED_EVAL_NONE:
974       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
975       code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
976       break; // No action
977     case CEED_EVAL_INTERP:
978       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
979       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
980       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
981       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
982       data->indices.out[i] = restr_data->d_ind;
983       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
984       break;
985     case CEED_EVAL_GRAD:
986       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
987       code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
988       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
989       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
990       data->indices.out[i] = restr_data->d_ind;
991       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
992       break;
993     case CEED_EVAL_WEIGHT: {
994       Ceed ceed;
995       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
996       return CeedError(ceed, 1,
997                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
998       break; // Should not occur
999     }
1000     case CEED_EVAL_DIV:
1001       break; // TODO: Not implemented
1002     case CEED_EVAL_CURL:
1003       break; // TODO: Not implemented
1004     }
1005   }
1006 
1007   code << "  }\n";
1008   code << "}\n\n";
1009 
1010   // std::cout << code.str();
1011 
1012   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1013   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1014   CeedChk(ierr);
1015 
1016   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1017 
1018   return 0;
1019 }
1020