xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c042f62f62e10e1321eb699b116e67a6568d5716)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #include <ceed-backend.h>
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda/ceed-cuda.h"
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
// Double-precision atomicAdd emulated with an atomicCAS loop, following the
// pattern from the CUDA C Programming Guide for devices without native
// atomicAdd(double*, double).
// NOTE(review): SM60+ provides a native double atomicAdd; JIT-compiling this
// definition for those architectures may redefine the builtin -- confirm the
// nvrtc compile path only includes this string for older compute capability.
// (Comments inside QUOTE(...) are stripped by the host preprocessor before
// stringification, so they never reach the JIT compiler.)
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the current bits as double, add val, and try to swap the
    // result in; retry if another thread changed the location meanwhile.
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
40 
// Device-side building blocks for generated cuda-gen operator kernels. The
// whole body is stringified by QUOTE(...) and handed to the JIT compiler
// together with the operator kernel assembled elsewhere in this file.
static const char *deviceFunctions = QUOTE(

// Bundles of field pointers passed to the generated kernel
// (at most 16 inputs and 16 outputs).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: cached thread indices plus this thread block's
// shared-memory scratch slice used by the Contract* helpers below.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

// Cooperatively copy a P x Q basis matrix from d_B (global memory) into B
// (the destination buffer, typically shared memory); all threads of the
// block participate via a block-stride loop over the flat index.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}
60 
//****
// 1D
// Read one element's dofs into registers, one node per thread; threads with
// tidx >= P1d are left untouched. indices == NULL means the restriction is
// the identity (dofs stored contiguously per element). Components are
// stored nnodes apart in d_u.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same as readDofs1d but for interleaved component storage
// (component is the fastest-varying index in d_u).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read one element's quadrature-point values into registers; every thread
// owns one quad point (no restriction indirection at quad points).
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Interleaved-component variant of readQuads1d.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add register values back to element dofs; atomicAdd because
// neighboring elements may share a node in d_v.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Interleaved-component variant of writeDofs1d.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature values; plain stores suffice since each quad point
// belongs to exactly one element.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Interleaved-component variant of writeQuads1d.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx;
  const CeedInt ind = node + elem * Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}
146 
// 1D contraction in x: V(tidx) = sum_i B(i, tidx) * U(i), with B a
// P1d x Q1d matrix stored with the P1d index fastest. U is staged through
// the block's shared-memory slice so every thread can read all P1d inputs.
// NOTE(review): assumes the thread block spans the Q1d output points in x
// and that slice holds at least blockDim.x entries -- confirm against the
// kernel launch configuration.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();  // slice must be fully populated before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();  // keep slice stable until all readers are done
}
158 
// Transpose of ContractX1d: V(tidx) = sum_i B(tidx, i) * U(i), mapping Q1d
// quadrature values back to P1d nodes (B is P1d x Q1d, P1d index fastest).
// U is staged through shared memory so every thread can read all Q1d inputs.
//
// Fix: guard the accumulation with tidx < P1d, matching the 2D/3D transpose
// contractions (ContractXTranspose2d, ContractTransposeX3d). Without the
// guard, threads with tidx in [P1d, Q1d) indexed B[tidx + i*P1d] beyond the
// matrix's P1d*Q1d extent when Q1d > P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
    }
  }
  __syncthreads();
}
170 
// Apply the 1D interpolation basis to each component: r_V = B * r_U.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// Transpose interpolation: r_V = B^T * r_U, per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: contracts with the derivative matrix c_G directly. c_B is
// unused in 1D; it is kept so the signature parallels the 2D/3D variants.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// Transpose 1D gradient: r_V = G^T * r_U, per component (c_B unused, see
// grad1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}
204 
//****
// 2D
// Read one element's dofs into registers; one node per (tidx, tidy) thread,
// threads outside the P1d x P1d tile are left untouched. indices == NULL
// means the identity restriction (contiguous element storage).
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Interleaved-component variant of readDofs2d (component fastest in d_u).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Read quadrature values; every thread in the Q1d x Q1d tile owns one point.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Interleaved-component variant of readQuads2d.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add register values back to element dofs; atomicAdd because
// neighboring elements may share nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Interleaved-component variant of writeDofs2d.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Write quadrature values; plain stores, each quad point owned by exactly
// one element.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind + nquads * comp] = r_v[comp];
  }
}

// Interleaved-component variant of writeQuads2d.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt node = data.tidx + data.tidy*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    d_v[ind * NCOMP + comp] = r_v[comp];
  }
}
290 
// 2D tensor contractions. The shared slice is addressed as a Q1d-strided
// 2D tile (tidx + tidy*Q1d); each call stages one register value per thread
// through shared memory, with barriers before the reads and before the
// slice may be reused by the next call.
// NOTE(review): assumes the thread block covers Q1d points in both x and y
// -- confirm against the kernel launch configuration.

// V(tidx, tidy) = sum_i B(i, tidx) * U(i, tidy): contract x, node -> quad.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// V(tidx, tidy) = sum_i B(i, tidy) * U(tidx, i): contract y, node -> quad.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transpose contraction in y, quad -> node; only threads with tidy < P1d
// produce a meaningful V (others are zeroed).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transpose contraction in x, quad -> node; only threads with tidx < P1d
// produce a meaningful V (others are zeroed).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// Same as ContractXTranspose2d but accumulates into V instead of
// overwriting it (used to sum the second derivative term in gradTranspose2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}
355 
// 2D interpolation, node -> quad: r_V = (B x B) r_U per component, applied
// as successive 1D contractions through the register temporary r_t.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D transpose interpolation, quad -> node (reverse contraction order).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient, node -> quad. Output layout: the d-th derivative of
// component comp lands at r_V[comp + d*NCOMP] (d = 0 for x, 1 for y).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// 2D transpose gradient, quad -> node: sums the transposed x- and
// y-derivative contributions into r_V (the second term accumulates via
// ContractXTransposeAdd2d). Input layout matches grad2d's output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
401 
//****
// 3D
// 3D data is held as a column of values per thread: the (tidx, tidy) thread
// owns all z-levels, stored in registers as r_u[z + comp*P1d] (nodes) or
// r_u[z + comp*Q1d] (quads).

// Read one element's dofs into registers; threads outside the P1d x P1d
// tile are left untouched. indices == NULL means the identity restriction.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Interleaved-component variant of readDofs3d (component fastest in d_u).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Read the full z-column of quadrature values for this thread.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind + nquads * comp];
    }
  }
}

// Read a single z-slice (level q) of quadrature values; r_u holds one
// value per component.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}

// Interleaved-component variant of readQuads3d.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp];
    }
  }
}

// Interleaved-component variant of readSliceQuads3d.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Scatter-add register values back to element dofs; atomicAdd because
// neighboring elements may share nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Interleaved-component variant of writeDofs3d.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Write the full z-column of quadrature values; plain stores, each quad
// point owned by exactly one element.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + nquads * comp] = r_v[z+comp*Q1d];
    }
  }
}

// Interleaved-component variant of writeQuads3d.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z=0; z < Q1d; ++z) {
    const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt ind = node + elem * Q1d*Q1d*Q1d;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d];
    }
  }
}
517 
// 3D tensor contractions. x and y contractions stage each z-level through
// the shared slice (tidx + tidy*Q1d layout) and cache the needed row/column
// of B in registers; the z contraction works entirely in this thread's
// registers, since the thread owns the whole z-column.

// Contract x, node -> quad: V[k](tidx,tidy) = sum_i B(i,tidx) * U[k](i,tidy)
// for every z-level k.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];  // cache this thread's row of B
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// Contract y, node -> quad: analogous to ContractX3d along the y index.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// Contract z, node -> quad: pure register work, no shared memory or
// barriers needed (the thread owns its whole z-column).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}

// Transpose contraction in z, quad -> node; levels k >= P1d are zeroed so
// the output column is fully initialized.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transpose contraction in y, quad -> node; only threads with tidy < P1d
// accumulate (others keep V[k] = 0).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Same as ContractTransposeY3d but accumulates into V instead of
// overwriting it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transpose contraction in x, quad -> node; only threads with tidx < P1d
// accumulate (others keep V[k] = 0).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Same as ContractTransposeX3d but accumulates into V instead of
// overwriting it (used to sum derivative terms in gradTranspose3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}
656 
// 3D interpolation, node -> quad: r_V = (B x B x B) r_U per component,
// applied as three successive 1D contractions through register temporaries.
// Columns are laid out as r_U[comp*P1d + z] and r_V[comp*Q1d + z].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// 3D transpose interpolation, quad -> node (reverse contraction order).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient, node -> quad. Output layout: the d-th derivative column of
// component comp starts at r_V + comp*Q1d + d*NCOMP*Q1d (d = 0,1,2 for
// x,y,z); each derivative applies c_G in its own direction and c_B in the
// other two.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// 3D transpose gradient, quad -> node: sums the three transposed derivative
// contributions into r_V; terms 2 and 3 accumulate via
// ContractTransposeAddX3d. Input layout matches grad3d's output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}
718 
// Collocated 3D gradient at z-level q (basis with Q1d == collocated points):
// derivatives are taken directly at the quadrature points using c_G only.
// Output layout: r_V[comp + d*NCOMP] for d = 0 (x), 1 (y), 2 (z).
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative: the thread owns the whole z-column in registers, so no
    // shared-memory staging is needed.
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transpose of gradCollo3d: accumulates (+=) the three derivative
// contributions at z-level q into r_V -- the caller is responsible for
// zeroing r_V before the first call. The z part only adds this level's
// contribution to every output level (hence "PARTIAL" below).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}
770 
// 1D quadrature weight: each thread writes the weight of its own point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  w[0] = qweight1d[data.tidx];
}
775 
// 2D quadrature weight: tensor product of the 1D weights at (tidx, tidy).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar wx = qweight1d[data.tidx];
  const CeedScalar wy = qweight1d[data.tidy];
  w[0] = wx*wy;
}
780 
// 3D quadrature weight: tensor product of the 1D weights along the thread's
// Z-column of quadrature points; w receives Q1d values.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = wxy*qweight1d[z];
}
789 
790 );
791 
792 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
793 
794 	using std::ostringstream;
795   using std::string;
796   int ierr;
797   bool setupdone;
798   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
799   if (setupdone) return 0;
800   Ceed ceed;
801   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
802   CeedOperator_Cuda_gen *data;
803   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
804   CeedQFunction qf;
805   CeedQFunction_Cuda_gen *qf_data;
806   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
807   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
808   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
809   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
810   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
811   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
812   CeedChk(ierr);
813   CeedOperatorField *opinputfields, *opoutputfields;
814   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
815   CeedChk(ierr);
816   CeedQFunctionField *qfinputfields, *qfoutputfields;
817   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
818   CeedChk(ierr);
819   CeedEvalMode emode;
820   CeedTransposeMode lmode;
821   CeedBasis basis;
822   CeedBasis_Cuda_shared *basis_data;
823   CeedElemRestriction Erestrict;
824   CeedElemRestriction_Cuda_reg *restr_data;
825 
826   ostringstream code;
827   string devFunctions(deviceFunctions);
828 
829   // Add atomicAdd function for old NVidia architectures
830   struct cudaDeviceProp prop;
831   Ceed delegate;
832   CeedGetDelegate(ceed, &delegate);
833   Ceed_Cuda *ceed_data;
834   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
835   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
836   if(prop.major<6){
837     code << atomicAdd;
838   }
839 
840   code << devFunctions;
841 
842   string qFunction(qf_data->qFunctionSource);
843 
844   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
845   code << "\n#define CeedPragmaSIMD\n";
846   code << qFunction;
847 
848   // Setup
849   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
850   // Input Evecs and Restriction
851   for (CeedInt i = 0; i < numinputfields; i++) {
852     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
853     CeedChk(ierr);
854     if (emode == CEED_EVAL_WEIGHT) { // Skip
855     } else {
856       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
857       if (emode != CEED_EVAL_NONE)
858       {
859         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
860         bool isTensor;
861         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
862         //TODO check that all are the same
863         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
864         if (isTensor)
865         {
866           //TODO check that all are the same
867           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
868         } else {
869           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
870         }
871       }
872     }
873   }
874   data->dim = dim;
875   data->Q1d = Q1d;
876 
877   for (CeedInt i = 0; i < numoutputfields; i++) {
878     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
879   }
880   code << "const CeedInt Dim = "<<dim<<";\n";
881   code << "const CeedInt Q1d = "<<Q1d<<";\n";
882   // code << "const CeedInt Q   = "<<Q<<";\n";
883   code << "extern __shared__ CeedScalar slice[];\n";
884   code << "BackendData data;\n";
885   code << "data.tidx = threadIdx.x;\n";
886   code << "data.tidy = threadIdx.y;\n";
887   code << "data.tidz = threadIdx.z;\n";
888   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
889   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
890   //Initialize constants, and matrices B and G
891   for (CeedInt i = 0; i < numinputfields; i++) {
892     code << "// Input field "<<i<<"\n";
893     // Get elemsize, emode, ncomp
894     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
895     CeedChk(ierr);
896     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
897     CeedChk(ierr);
898     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
899     CeedChk(ierr);
900     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
901     CeedChk(ierr);
902     // Basis action
903     switch (emode) {
904     case CEED_EVAL_NONE:
905       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
906       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
907       code << "  const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n";
908       break;
909     case CEED_EVAL_INTERP:
910       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
911       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
912       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
913       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
914       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
915       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
916       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
917       data->B.in[i] = basis_data->d_interp1d;
918       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
919       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
920       break;
921     case CEED_EVAL_GRAD:
922       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
923       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
924       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
925       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
926       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
927       code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
928       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
929       data->B.in[i] = basis_data->d_interp1d;
930       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932       if (basis_data->d_collograd1d) {
933         data->G.in[i] = basis_data->d_collograd1d;
934         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
935         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
936       } else {
937         data->G.in[i] = basis_data->d_grad1d;
938         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
939         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
940       }
941       break;
942     case CEED_EVAL_WEIGHT:
943       break; // No action
944     case CEED_EVAL_DIV:
945       break; // TODO: Not implemented
946     case CEED_EVAL_CURL:
947       break; // TODO: Not implemented
948     }
949   }
950   for (CeedInt i = 0; i < numoutputfields; i++) {
951     code << "// Output field "<<i<<"\n";
952     // Get elemsize, emode, ncomp
953     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
954     CeedChk(ierr);
955     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
956     CeedChk(ierr);
957     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
958     CeedChk(ierr);
959     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
960     CeedChk(ierr);
961     // Basis action
962     switch (emode) {
963     case CEED_EVAL_NONE:
964       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
965       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
966       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
967       code << "  const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n";
968       break; // No action
969     case CEED_EVAL_INTERP:
970       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
971       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
972       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
973       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
974       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
975       data->B.out[i] = basis_data->d_interp1d;
976       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
977       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
978       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
979       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
980       break;
981     case CEED_EVAL_GRAD:
982       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
983       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
984       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
985       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
986       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
987       data->B.out[i] = basis_data->d_interp1d;
988       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990       if (basis_data->d_collograd1d) {
991         data->G.out[i] = basis_data->d_collograd1d;
992         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
993         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
994       } else {
995         data->G.out[i] = basis_data->d_grad1d;
996         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
997         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
998       }
999       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1000       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1001       break;
1002     case CEED_EVAL_WEIGHT: {
1003       Ceed ceed;
1004       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1005       return CeedError(ceed, 1,
1006                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1007       break; // Should not occur
1008     }
1009     case CEED_EVAL_DIV:
1010       break; // TODO: Not implemented
1011     case CEED_EVAL_CURL:
1012       break; // TODO: Not implemented
1013     }
1014   }
1015   code << "\n";
1016   code << "__syncthreads();\n";
1017   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1018   // Input basis apply if needed
1019   // Generate the correct eval mode code for each input
1020   for (CeedInt i = 0; i < numinputfields; i++) {
1021     code << "  // Input field "<<i<<"\n";
1022     // Get elemsize, emode, ncomp
1023     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1024     CeedChk(ierr);
1025     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1026     CeedChk(ierr);
1027     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1028     CeedChk(ierr);
1029     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1030     CeedChk(ierr);
1031     // Basis action
1032     switch (emode) {
1033     case CEED_EVAL_NONE:
1034       if (!basis_data->d_collograd1d) {
1035         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1036         ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
1037         code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
1038       }
1039       break;
1040     case CEED_EVAL_INTERP:
1041       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1042       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
1043       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1044       data->indices.in[i] = restr_data->d_ind;
1045       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1046       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1047       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1048       break;
1049     case CEED_EVAL_GRAD:
1050       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1051       ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
1052       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1053       data->indices.in[i] = restr_data->d_ind;
1054       code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1055       if (basis_data->d_collograd1d) {
1056         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1057         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1058       } else {
1059         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1060         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1061       }
1062       break;
1063     case CEED_EVAL_WEIGHT:
1064       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1065       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1066       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1067       data->W = basis_data->d_qweight1d;
1068       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1069       break; // No action
1070     case CEED_EVAL_DIV:
1071       break; // TODO: Not implemented
1072     case CEED_EVAL_CURL:
1073       break; // TODO: Not implemented
1074     }
1075   }
1076 
1077   // Q function
1078   for (CeedInt i = 0; i < numoutputfields; i++) {
1079       code << "  // Output field "<<i<<"\n";
1080     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1081     CeedChk(ierr);
1082     if (emode==CEED_EVAL_GRAD)
1083     {
1084       if (basis_data->d_collograd1d) {
1085         //Accumulator for gradient slices
1086         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1087         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1088         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1089         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1090         code << "    }\n";
1091         code << "  }\n";
1092       } else {
1093         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1094       }
1095     }
1096     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1097     {
1098       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1099     }
1100   }
1101   //We treat quadrature points per slice in 3d to save registers
1102   code << "// QFunction\n";
1103   if (basis_data->d_collograd1d) {
1104     code << "#pragma unroll\n";
1105     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1106     for (CeedInt i = 0; i < numinputfields; i++) {
1107       code << "  // Input field "<<i<<"\n";
1108       // Get elemsize, emode, ncomp
1109       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1110       CeedChk(ierr);
1111       // Basis action
1112       switch (emode) {
1113       case CEED_EVAL_NONE:
1114         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1115         ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
1116         code << "  readSliceQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1117         break;
1118       case CEED_EVAL_INTERP:
1119         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1120         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1121         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1122         code << "  }\n";
1123         break;
1124       case CEED_EVAL_GRAD:
1125         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1126         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1127         break;
1128       case CEED_EVAL_WEIGHT:
1129         code << "  CeedScalar r_q"<<i<<"[1];\n";
1130         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1131         break; // No action
1132       case CEED_EVAL_DIV:
1133         break; // TODO: Not implemented
1134       case CEED_EVAL_CURL:
1135         break; // TODO: Not implemented
1136       }
1137     }
1138     for (CeedInt i = 0; i < numoutputfields; i++) {
1139       code << "  // Output field "<<i<<"\n";
1140       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1141       CeedChk(ierr);
1142       // Basis action
1143       switch (emode) {
1144       case CEED_EVAL_NONE:
1145         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1146         break; // No action
1147       case CEED_EVAL_INTERP:
1148         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1149         break;
1150       case CEED_EVAL_GRAD:
1151         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1152         break;
1153       case CEED_EVAL_WEIGHT:
1154         break; // Should not occur
1155       case CEED_EVAL_DIV:
1156         break; // TODO: Not implemented
1157       case CEED_EVAL_CURL:
1158         break; // TODO: Not implemented
1159       }
1160     }
1161   } else {
1162     for (CeedInt i = 0; i < numinputfields; i++) {
1163       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1164     }
1165     for (CeedInt i = 0; i < numoutputfields; i++) {
1166       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1167     }
1168   }
1169   code << "  CeedScalar* in["<<numinputfields<<"];\n";
1170   for (CeedInt i = 0; i < numinputfields; i++) {
1171     code << "  in["<<i<<"] = r_q"<<i<<";\n";
1172   }
1173   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
1174   for (CeedInt i = 0; i < numoutputfields; i++) {
1175     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
1176   }
1177   string qFunctionName(qf_data->qFunctionName);
1178   code << "  "<<qFunctionName<<"(ctx, ";
1179   if(dim!=3 || basis_data->d_collograd1d) {
1180     code << "1 ";
1181   }else{
1182     code << "Q1d ";
1183   }
1184   code << ", in, out);\n";
1185   if (basis_data->d_collograd1d) {
1186     for (CeedInt i = 0; i < numoutputfields; i++) {
1187       code << "  // Output field "<<i<<"\n";
1188       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1189       CeedChk(ierr);
1190       // Basis action
1191       switch (emode) {
1192       case CEED_EVAL_NONE:
1193         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1194         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1195         code << "  }\n";
1196         break; // No action
1197       case CEED_EVAL_INTERP:
1198         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1199         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1200         code << "  }\n";
1201         break;
1202       case CEED_EVAL_GRAD:
1203         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1204         break;
1205       case CEED_EVAL_WEIGHT:
1206         break; // Should not occur
1207       case CEED_EVAL_DIV:
1208         break; // TODO: Not implemented
1209       case CEED_EVAL_CURL:
1210         break; // TODO: Not implemented
1211       }
1212     }
1213     code << "}\n";
1214   }
1215 
1216   // Output basis apply if needed
1217   // Generate the correct eval mode code for each output
1218   for (CeedInt i = 0; i < numoutputfields; i++) {
1219     code << "  // Output field "<<i<<"\n";
1220     // Get elemsize, emode, ncomp
1221     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1222     CeedChk(ierr);
1223     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1224     CeedChk(ierr);
1225     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1226     CeedChk(ierr);
1227     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1228     CeedChk(ierr);
1229     // Basis action
1230     switch (emode) {
1231     case CEED_EVAL_NONE:
1232       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1233       code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
1234       break; // No action
1235     case CEED_EVAL_INTERP:
1236       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1237       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1238       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1239       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1240       data->indices.out[i] = restr_data->d_ind;
1241       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1242       break;
1243     case CEED_EVAL_GRAD:
1244       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1245       if (basis_data->d_collograd1d) {
1246         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1247       } else {
1248         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1249       }
1250       ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
1251       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1252       data->indices.out[i] = restr_data->d_ind;
1253       code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1254       break;
1255     case CEED_EVAL_WEIGHT: {
1256       Ceed ceed;
1257       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1258       return CeedError(ceed, 1,
1259                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1260       break; // Should not occur
1261     }
1262     case CEED_EVAL_DIV:
1263       break; // TODO: Not implemented
1264     case CEED_EVAL_CURL:
1265       break; // TODO: Not implemented
1266     }
1267   }
1268 
1269   code << "  }\n";
1270   code << "}\n\n";
1271 
1272   // std::cout << code.str();
1273 
1274   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1275   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1276   CeedChk(ierr);
1277 
1278   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1279 
1280   return 0;
1281 }
1282