xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 288c044332e33f37503f09b6484fec9d0a55fba1)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #include <ceed-backend.h>
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda/ceed-cuda.h"
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
// Software fallback for double-precision atomicAdd, prepended to the
// generated CUDA source only for devices with compute capability < 6.0
// (see the prop.major < 6 check in CeedCudaGenOperatorBuild), which lack
// a native double atomicAdd.  Canonical atomicCAS loop from the CUDA C
// Programming Guide.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
40 
// Templated device helpers stringified into every generated operator
// kernel.  Comments inside QUOTE() are removed by the preprocessor
// before stringification, so they never reach the runtime-compiled
// source and are free documentation.
static const char *deviceFunctions = QUOTE(

// Argument bundles for the generated "oper" kernel: up to 16 input and
// 16 output field pointers (data values and restriction indices).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread bookkeeping: 3D thread indices, the flattened thread id,
// and this thread's pointer into the dynamic shared-memory slice
// (offset by tidz, see the generated "data.slice = slice + ..." line).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy the P*Q matrix d_B (global memory) into B
// (presumably a shared-memory staging buffer -- confirm at call sites),
// strided by the full block size.  NOTE(review): no __syncthreads()
// here; callers must synchronize before reading B.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}
60 
//****
// 1D
// Gather the P1d dofs of element `elem` from the global vector d_u into
// per-thread registers r_u, one node per thread (tidx).  A NULL
// `indices` means an identity restriction.  Components are separated by
// `nnodes` in d_u (component-major layout).
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same as readDofs1d but for interleaved layout: the components of a
// node are contiguous (stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read quadrature-point values (no restriction indices; the data is laid
// out by element).  Component-major layout (stride nquads).
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Interleaved-layout variant of readQuads1d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}
104 
// Scatter element dofs back to the global vector in component-major
// layout (stride nnodes).  atomicAdd is used because restriction
// indices can map nodes of different elements to the same global entry.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Interleaved-layout variant of writeDofs1d (component stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature values; each (elem, node) slot has a single writer,
// so plain stores are safe (no atomics).  Component stride nquads.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Interleaved-layout variant of writeQuads1d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}
146 
// Tensor contraction along x (nodes -> quadrature points): stage each
// thread's value in the shared slice, synchronize, then thread q=tidx
// accumulates sum_i B[i + q*P1d] * slice[i].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Transposed contraction along x (quadrature points -> nodes): thread
// p=tidx accumulates sum_i B[p + i*P1d] * slice[i] over Q1d values.
// NOTE(review): unlike the 2D/3D transposes there is no tidx<P1d guard;
// threads with tidx>=P1d compute values callers must ignore -- confirm.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}

// Interpolate each of NCOMP components from nodes to quadrature points.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose of interp1d (quadrature points -> nodes).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient at quadrature points: a single contraction with the 1D
// derivative matrix c_G.  c_B is unused here, kept for signature
// uniformity with the 2D/3D variants.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose of grad1d; c_B unused likewise.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}
204 
//****
// 2D
// Gather the P1d*P1d dofs of element `elem` into registers, one node
// per (tidx, tidy) thread.  NULL `indices` = identity restriction.
// Component-major layout (stride nnodes).
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Interleaved-layout variant of readDofs2d (component stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read quadrature-point values for the (tidx, tidy) point of `elem`.
// Component-major layout (stride nquads).
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Interleaved-layout variant of readQuads2d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter element dofs to the global vector; atomicAdd handles nodes
// shared between elements.  Component-major layout (stride nnodes).
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Interleaved-layout variant of writeDofs2d (component stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature values; unique writer per slot, plain stores.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Interleaved-layout variant of writeQuads2d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for(CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}
290 
// 2D x-contraction (nodes -> quadrature points): the shared slice holds
// a Q1d x Q1d plane indexed [tidx + tidy*Q1d]; each thread contracts
// its row of the plane against B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// 2D y-contraction: same staging, contracting down the columns of the
// shared plane.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (int i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transposed y-contraction (quadrature -> nodes); only threads with
// tidy<P1d produce a meaningful result, but all threads hit both
// __syncthreads() barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transposed x-contraction; only threads with tidx<P1d produce output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Like ContractXTranspose2d but accumulates into V without zeroing it
// first -- used to sum the second derivative term in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}
355 
// 2D interpolation (nodes -> quadrature points): X then Y contraction
// per component, via a one-value register temporary.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// Transpose of interp2d: Y-transpose then X-transpose contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: writes the x-derivative to r_V[comp] and the
// y-derivative to r_V[comp + NCOMP] (derivative-major, stride NCOMP).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// Transpose of grad2d: both derivative contributions are summed into
// r_V[comp]; the second uses the accumulating X-transpose contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
401 
//****
// 3D
// Gather the element dofs into registers: each (tidx, tidy) thread owns
// the whole z-column, stored z-fastest per component (r_u[z + comp*P1d]).
// NULL `indices` = identity restriction; component stride nnodes.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Interleaved-layout variant of readDofs3d (component stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Read the z-column of quadrature values for this thread's (tidx, tidy)
// point; registers indexed r_u[z + comp*Q1d], component stride nquads.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Interleaved-layout variant of readQuads3d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Scatter element dofs back to the global vector; atomicAdd handles
// nodes shared between elements.  Component stride nnodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Interleaved-layout variant of writeDofs3d (component stride NCOMP).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for(CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write quadrature values; unique writer per slot, plain stores.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Interleaved-layout variant of writeQuads3d (component stride NCOMP).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for(CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for(CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}
499 
// 3D x-contraction, done plane-by-plane: for each z-level k, stage the
// plane in the shared Q1d x Q1d slice, sync, contract along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// 3D y-contraction, plane-by-plane like ContractX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// 3D z-contraction: each thread owns its full z-column in registers, so
// no shared memory or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (int i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transposed z-contraction (register-only).  V[k] is zeroed for all
// k<Q1d even though only the first P1d entries accumulate, which keeps
// the unused tail of the Q1d-sized temporary defined.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed y-contraction, plane-by-plane; only threads with tidy<P1d
// produce output, but all threads reach both barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed x-contraction, plane-by-plane; only threads with tidx<P1d
// produce output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Accumulating variant of ContractTransposeX3d (no zeroing of V) --
// used to sum gradient contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (int i = 0; i < Q1d; ++i) {
        V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}
598 
// 3D interpolation (nodes -> quadrature points): X, Y, then Z
// contraction per component, through two Q1d-sized register temporaries.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// Transpose of interp3d: Z-, Y-, then X-transpose contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: the d-th derivative of component comp lands at
// r_V[comp*Q1d + d*NCOMP*Q1d], d = 0 (x), 1 (y), 2 (z).  Each
// derivative applies c_G in its own direction and c_B in the others.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// Transpose of grad3d: the three derivative contributions are summed
// into r_V[comp*P1d]; the last two use the accumulating X-transpose.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for(int comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}
660 
// Quadrature weight for this thread's 1D point (tidx).
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

// Tensor-product weight for this thread's (tidx, tidy) 2D point.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

// Tensor-product weights for the whole z-column of this thread's
// (tidx, tidy) point: w[z] = w1d(tidx) * w1d(tidy) * w1d(z).
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (int z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);
681 
682 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
683 
684 	using std::ostringstream;
685   using std::string;
686   int ierr;
687   bool setupdone;
688   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
689   if (setupdone) return 0;
690   Ceed ceed;
691   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
692   CeedOperator_Cuda_gen *data;
693   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
694   CeedQFunction qf;
695   CeedQFunction_Cuda_gen *qf_data;
696   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
697   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
698   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
699   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
700   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
701   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
702   CeedChk(ierr);
703   CeedOperatorField *opinputfields, *opoutputfields;
704   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
705   CeedChk(ierr);
706   CeedQFunctionField *qfinputfields, *qfoutputfields;
707   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
708   CeedChk(ierr);
709   CeedEvalMode emode;
710   CeedTransposeMode lmode;
// ---------------------------------------------------------------------------
// Tail of the cuda-gen operator build routine.  (The function's signature and
// the declarations of ierr, dim, Q1d, P1d, elemsize, emode, lmode, nnodes,
// ncomp, numinputfields/numoutputfields, op*, qf*, data, qf_data, ceed, etc.
// precede this excerpt.)  It assembles the complete CUDA source of a fused
// operator kernel ("oper") into the `code` stream -- device helpers, the
// user QFunction source, per-field restriction/basis glue -- then
// JIT-compiles it and caches the kernel handle in `data`.
// ---------------------------------------------------------------------------
711   CeedBasis basis;
712   CeedBasis_Cuda_shared *basis_data;
713   CeedElemRestriction Erestrict;
714   CeedElemRestriction_Cuda_reg *restr_data;
715 
716   ostringstream code;
717   string devFunctions(deviceFunctions);
718 
719   // Add atomicAdd function for old NVidia architectures
720   struct cudaDeviceProp prop;
721   Ceed delegate;
722   CeedGetDelegate(ceed, &delegate);
723   Ceed_Cuda *ceed_data;
724   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
725   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
// NOTE(review): unlike every neighboring call, the return codes of
// CeedGetDelegate() above and cudaGetDeviceProperties() are NOT checked with
// CeedChk -- on failure `prop` would be read uninitialized below.  Native
// atomicAdd(double*) requires compute capability 6.0+, hence the source-level
// CAS-loop fallback (defined at the top of this file) for prop.major < 6.
726   if(prop.major<6){
727     code << atomicAdd;
728   }
729 
730   code << devFunctions;
731 
732   string qFunction(qf_data->qFunctionSource);
733 
// The generated source defines CEED_QFUNCTION/CeedPragmaSIMD before pasting
// the user's QFunction source, so it compiles as an inline __device__ helper.
734   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
735   code << "\n#define CeedPragmaSIMD\n";
736   code << qFunction;
737 
738   // Setup
// Kernel prologue: signature of the single fused kernel.  `indices`, `fields`,
// `B`, `G`, `W` mirror the device pointers recorded in data-> below.
739   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
740   // Input Evecs and Restriction
// First pass over inputs: declare the device input pointers and determine
// dim / Q1d from the (tensor) bases; non-tensor bases are rejected.
741   for (CeedInt i = 0; i < numinputfields; i++) {
742     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
743     CeedChk(ierr);
744     if (emode == CEED_EVAL_WEIGHT) { // Skip
745     } else {
746       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
747       if (emode != CEED_EVAL_NONE)
748       {
749         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
750         bool isTensor;
751         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
752         //TODO check that all are the same
753         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
754         if (isTensor)
755         {
756           //TODO check that all are the same
757           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
758         } else {
759           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
760         }
761       }
762     }
763   }
// Record the discovered geometry on the backend data (presumably used for
// launch-time block/shared-memory sizing outside this excerpt -- confirm).
764   data->dim = dim;
765   data->Q1d = Q1d;
766 
767   for (CeedInt i = 0; i < numoutputfields; i++) {
768     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
769   }
// Compile-time constants and per-thread bookkeeping inside the generated
// kernel; `slice` is the dynamically sized shared-memory scratch area, and
// each z-slice of threads gets its own Q1d (or Q1d*Q1d for dim>1) window.
770   code << "const CeedInt Dim = "<<dim<<";\n";
771   code << "const CeedInt Q1d = "<<Q1d<<";\n";
772   // code << "const CeedInt Q   = "<<Q<<";\n";
773   code << "extern __shared__ CeedScalar slice[];\n";
774   code << "BackendData data;\n";
775   code << "data.tidx = threadIdx.x;\n";
776   code << "data.tidy = threadIdx.y;\n";
777   code << "data.tidz = threadIdx.z;\n";
778   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
779   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
// Second pass over inputs: emit per-field sizes; INTERP/GRAD fields also
// stage their 1D interp/grad matrices in __shared__ arrays and record the
// corresponding device pointers in data->B / data->G for the launcher.
780   for (CeedInt i = 0; i < numinputfields; i++) {
781     code << "// Input field "<<i<<"\n";
782     // Get elemsize, emode, ncomp
783     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
784     CeedChk(ierr);
785     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
786     CeedChk(ierr);
787     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
788     CeedChk(ierr);
789     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
790     CeedChk(ierr);
791     // Basis action
792     switch (emode) {
793     case CEED_EVAL_NONE:
794       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
795       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
796       code << "  const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n";
797       break;
798     case CEED_EVAL_INTERP:
799       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
800       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
801       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
802       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
803       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
804       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
805       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
806       data->B.in[i] = basis_data->d_interp1d;
807       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
808       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
809       break;
810     case CEED_EVAL_GRAD:
811       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
812       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
813       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
814       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
815       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
816       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
817       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
818       data->B.in[i] = basis_data->d_interp1d;
819       data->G.in[i] = basis_data->d_grad1d;
820       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
821       code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
822       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
823       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
824       break;
825     case CEED_EVAL_WEIGHT:
826       break; // No action
827     case CEED_EVAL_DIV:
828       break; // TODO: Not implemented
829     case CEED_EVAL_CURL:
830       break; // TODO: Not implemented
831     }
832   }
// Same per-field setup for outputs (WEIGHT is invalid as an output mode).
833   for (CeedInt i = 0; i < numoutputfields; i++) {
834     code << "// Output field "<<i<<"\n";
835     // Get elemsize, emode, ncomp
836     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
837     CeedChk(ierr);
838     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
839     CeedChk(ierr);
840     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
841     CeedChk(ierr);
842     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
843     CeedChk(ierr);
844     // Basis action
845     switch (emode) {
846     case CEED_EVAL_NONE:
847       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
848       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
849       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
850       code << "  const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n";
851       break; // No action
852     case CEED_EVAL_INTERP:
853       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
854       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
855       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
856       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
857       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
858       data->B.out[i] = basis_data->d_interp1d;
859       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
860       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
861       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
862       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
863       break;
864     case CEED_EVAL_GRAD:
865       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
866       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
867       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
868       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
869       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
870       data->B.out[i] = basis_data->d_interp1d;
871       data->G.out[i] = basis_data->d_grad1d;
872       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
873       code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
874       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
875       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
876       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
877       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
878       break;
879     case CEED_EVAL_WEIGHT: {
880       Ceed ceed;
881       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
882       return CeedError(ceed, 1,
883                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
884       break; // Should not occur
885     }
886     case CEED_EVAL_DIV:
887       break; // TODO: Not implemented
888     case CEED_EVAL_CURL:
889       break; // TODO: Not implemented
890     }
891   }
// Generated element loop: elements are partitioned across gridDim.x blocks
// and the z thread dimension within each block (stride gridDim.x*blockDim.z).
892   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
893   // Input basis apply if needed
// Per element: gather input E-vector values into registers (readDofs/
// readQuads, transposed per lmode) and apply interp/grad/weight, producing
// the r_t<i> register arrays consumed by the QFunction.
894   for (CeedInt i = 0; i < numinputfields; i++) {
895     code << "// Input field "<<i<<"\n";
896     // Get elemsize, emode, ncomp
897     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
898     CeedChk(ierr);
899     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
900     CeedChk(ierr);
901     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
902     CeedChk(ierr);
903     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
904     CeedChk(ierr);
905     // Basis action
906     switch (emode) {
907     case CEED_EVAL_NONE:
908       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
909       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
910       code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
911       break;
912     case CEED_EVAL_INTERP:
913       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
914       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
915       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
916       data->indices.in[i] = restr_data->d_ind;
917       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
918       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
919       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
920       break;
921     case CEED_EVAL_GRAD:
922       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
923       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
924       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
925       data->indices.in[i] = restr_data->d_ind;
926       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
927       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
928       code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
929       break;
930     case CEED_EVAL_WEIGHT:
931       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
932       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
933       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
934       data->W = basis_data->d_qweight1d;
935       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
936       break; // No action
937     case CEED_EVAL_DIV:
938       break; // TODO: Not implemented
939     case CEED_EVAL_CURL:
940       break; // TODO: Not implemented
941     }
942   }
943   // Q function
// Declare output register arrays (r_tt<i>), marshal in[]/out[] pointer
// tables, and call the user QFunction.  The point count passed is "Q1d" for
// dim==3 and "1" otherwise (in lower dimensions the QFunction is invoked
// pointwise per thread).
944   code << "// QFunction\n";
945   for (CeedInt i = 0; i < numoutputfields; i++) {
946     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
947     CeedChk(ierr);
948     if (emode==CEED_EVAL_GRAD)
949     {
950       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
951     }
952     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
953     {
954       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
955     }
956   }
957   code << "  CeedScalar* in["<<numinputfields<<"];\n";
958   for (CeedInt i = 0; i < numinputfields; i++) {
959     code << "  in["<<i<<"] = r_t"<<i<<";\n";
960   }
961   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
962   for (CeedInt i = 0; i < numoutputfields; i++) {
963     code << "  out["<<i<<"] = r_tt"<<i<<";\n";
964   }
965   string qFunctionName(qf_data->qFunctionName);
966   code << "  "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
967   code << "in, out";
968   code << ");\n";
969 
970   // Output basis apply if needed
// Per element: apply transposed basis actions to the QFunction outputs and
// scatter them back (writeDofs/writeQuads; these presumably rely on the
// atomicAdd emitted above when restrictions overlap -- confirm in helpers).
971   for (CeedInt i = 0; i < numoutputfields; i++) {
972     code << "// Output field "<<i<<"\n";
973     // Get elemsize, emode, ncomp
974     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
975     CeedChk(ierr);
976     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
977     CeedChk(ierr);
978     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
979     CeedChk(ierr);
980     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
981     CeedChk(ierr);
982     // Basis action
983     switch (emode) {
984     case CEED_EVAL_NONE:
985       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
986       code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
987       break; // No action
988     case CEED_EVAL_INTERP:
989       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
990       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
991       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
992       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
993       data->indices.out[i] = restr_data->d_ind;
994       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
995       break;
996     case CEED_EVAL_GRAD:
997       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
998       code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
999       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1000       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1001       data->indices.out[i] = restr_data->d_ind;
1002       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1003       break;
1004     case CEED_EVAL_WEIGHT: {
1005       Ceed ceed;
1006       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1007       return CeedError(ceed, 1,
1008                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1009       break; // Should not occur
1010     }
1011     case CEED_EVAL_DIV:
1012       break; // TODO: Not implemented
1013     case CEED_EVAL_CURL:
1014       break; // TODO: Not implemented
1015     }
1016   }
1017 
// Close the generated element loop and the kernel body.
1018   code << "  }\n";
1019   code << "}\n\n";
1020 
1021   // std::cout << code.str();
1022 
// JIT-compile the assembled source and fetch the "oper" kernel handle into
// the backend data, then mark the operator's setup as complete.
1023   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1024   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1025   CeedChk(ierr);
1026 
1027   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1028 
1029   return 0;
1030 }
1031
1031