// xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision a81eb9234e2e84f4da89341343356ae857465fa7)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #include <ceed-backend.h>
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda/ceed-cuda.h"
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
// Software emulation of double-precision atomicAdd built on atomicCAS.
// This string is prepended to the generated kernel source only for devices
// with compute capability < 6 (see the prop.major < 6 check below), which
// lack the native double-precision atomicAdd.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
40 
// Device-side support code stringified at compile time and concatenated into
// the CUDA source that is JIT-compiled at runtime for each operator.
static const char *deviceFunctions = QUOTE(

// Bundles of field pointers passed to the generated kernel (up to 16 input
// and 16 output fields).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context: thread coordinates within the block, the flattened
// thread id, and a pointer to the shared-memory slice used by the tensor
// contraction helpers below.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;
53 
// Cooperatively copy a P*Q basis matrix from global memory (d_B) to B:
// all threads of the block stride over the entries starting at their
// flattened thread id.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
}
60 
//****
// 1D dof gather/scatter: one thread per node (tidx), registers r_u/r_v hold
// one value per component.

// Gather the element's P1d nodal values into registers via the restriction
// index array; components are separated by nnodes (field-major layout).
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Same gather for interlaced storage: the components of a node are
// contiguous (d_u[ind*NCOMP + comp]).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather using compile-time strides instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// Scatter-add register values back to global memory; atomicAdd is used since
// the restriction may map the same global node from several elements that are
// processed concurrently.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add, interlaced layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Strided scatter uses plain += rather than atomicAdd.
// NOTE(review): this assumes no two elements address the same strided
// location — confirm against the strided-restriction contract.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}
134 
// 1D contraction: V(tidx) = sum_i B(i,tidx) * U(i), with B stored as
// B[i + tidx*P1d] (P1d-major). U is exchanged between threads through the
// shared-memory slice; both barriers are executed by every thread.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();
}
146 
// 1D transposed contraction (Q1d -> P1d): V(tidx) = sum_i B(tidx,i) * U(i).
// The accumulation is guarded by tidx < P1d: the block runs Q1d threads in x,
// and without the guard a thread with tidx >= P1d reads B[tidx + i*P1d]
// beyond the P1d*Q1d matrix when Q1d > P1d. This mirrors the guards already
// used by ContractXTranspose2d and ContractTransposeX3d. Both barriers stay
// outside the guard so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
    }
  }
  __syncthreads();
}
158 
// 1D interpolation: apply the basis matrix c_B to each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D transposed interpolation: apply c_B^T to each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp);
  }
}

// 1D gradient: apply the derivative matrix c_G (c_B is unused in 1D but kept
// for a uniform signature across dimensions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}

// 1D transposed gradient: apply c_G^T.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp);
  }
}
192 
//****
// 2D dof gather/scatter: one thread per node, node = tidx + tidy*P1d,
// element size P1d*P1d.

// Gather nodal values through the restriction indices; components separated
// by nnodes (field-major layout).
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + nnodes * comp];
    }
  }
}

// Gather, interlaced layout (components of a node contiguous).
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind * NCOMP + comp];
    }
  }
}

// Gather using compile-time strides instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

// Scatter-add; atomicAdd since concurrent elements may share global nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]);
    }
  }
}

// Scatter-add, interlaced layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]);
    }
  }
}

// Strided scatter with plain += (no atomics); see the 1D strided note.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d)
  {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}
266 
// 2D tensor contractions. B is stored as B[p + q*P1d]; data.slice is a
// Q1d*Q1d shared-memory tile used to exchange U between threads. The
// __syncthreads() barriers are always outside the guards, so every thread
// of the block reaches them.

// V(tx,ty) = sum_i B(i,tx) * U(i,ty) — contract along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();
}

// V(tx,ty) = sum_i B(i,ty) * U(tx,i) — contract along y.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();
}

// Transposed contraction along y (Q1d -> P1d); threads with tidy >= P1d only
// zero V and participate in the barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();
}

// Transposed contraction along x (Q1d -> P1d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}

// As ContractXTranspose2d but accumulates into V (no zeroing); used to sum
// the second derivative term in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();
}
331 
// 2D interpolation: contract with c_B in x then y for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D transposed interpolation: c_B^T in y then x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}

// 2D gradient: output has 2 blocks of NCOMP values — x-derivative (G then B)
// at offset 0*NCOMP, y-derivative (B then G) at offset 1*NCOMP.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP);
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t);
    ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP);
  }
}

// 2D transposed gradient: sums the transposed x- and y-derivative
// applications into r_V (the second pass uses the Add variant).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp);
  }
}
377 
//****
// 3D dof gather/scatter: a 2D thread slab (tidx,tidy) loops over z planes;
// registers hold P1d values per component (r_u[z + comp*P1d]).

// Gather through the restriction indices; components separated by nnodes.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + nnodes * comp];
      }
    }
  }
}

// Gather, interlaced layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind * NCOMP + comp];
      }
    }
  }
}

// Gather using compile-time strides instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
      }
    }
  }
}
418 
// Read one x-y slice (z-index q) of quadrature-point data for an element,
// gathering through the restriction indices; components are separated by
// nquads. (Also drops a stray double semicolon after the index computation.)
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + nquads * comp];
  }
}
427 
// Read one x-y slice (z-index q) of quadrature data, interlaced layout.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind * NCOMP + comp];
  }
}

// Read one x-y slice of quadrature data with compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
445 
// Scatter-add 3D register values back to global memory; atomicAdd since
// concurrent elements may share global nodes.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Scatter-add, interlaced layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]);
      }
    }
  }
}

// Strided scatter with plain += (no atomics); see the 1D strided note.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx<P1d && data.tidy<P1d) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
      }
    }
  }
}
484 
// 3D contractions: x and y contract one z-plane at a time through the shared
// slice; z contracts entirely in registers (each thread owns a full z-line).
// This thread's row of B is preloaded into registers (r_B) before the plane
// loop so it is read once instead of once per plane.

// V(tx,ty,k) = sum_i B(i,tx) * U(i,ty,k) — contract along x, plane by plane.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();
  }
}

// V(tx,ty,k) = sum_i B(i,ty) * U(tx,i,k) — contract along y, plane by plane.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();
  }
}

// V(k) = sum_i B(i,k) * U(i) — contract along z, purely in registers (no
// inter-thread exchange needed since each thread holds its full z-line).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += B[i + k*P1d] * U[i];//contract z direction
    }
  }
}
531 
// 3D transposed contractions (Q1d -> P1d per direction). Accumulation is
// guarded by the relevant thread coordinate < P1d; barriers stay outside the
// guards. The "Add" variants accumulate into V instead of zeroing it first.

// Transposed z contraction in registers; entries k >= P1d are zeroed only.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += B[k + i*P1d] * U[i];//contract z direction
      }
    }
  }
}

// Transposed y contraction, plane by plane.
// NOTE(review): the r_B preload indexes B[data.tidy + i*P1d] for all threads,
// including tidy >= P1d, which can read past the P1d*Q1d matrix when
// Q1d > P1d (the guarded loop below never uses those values) — confirm.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed y contraction that accumulates into V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();
  }
}

// Transposed x contraction, plane by plane (same preload caveat, with tidx).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}

// Transposed x contraction that accumulates into V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();
  }
}
623 
// 3D interpolation: contract with c_B in x, y, then z for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
  }
}

// 3D transposed interpolation: c_B^T in z, y, then x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}

// 3D gradient: three passes, one per derivative direction (G in that
// direction, B in the other two); outputs at offsets 0/1/2 * NCOMP*Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
  }
}

// 3D transposed gradient: sums the three transposed derivative applications
// into r_V (the second and third passes use the Add variant in x).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp=0; comp<NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
  }
}
685 
// Collocated 3D gradient at quadrature plane q: c_G is the Q1d*Q1d
// derivative matrix at quadrature points, so no interpolation pass is
// needed. Output layout: x/y/z derivatives at offsets 0/1/2 * NCOMP.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative: in registers, since this thread holds its full z-line.
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads();
  }
}

// Transposed collocated gradient: accumulates the transposed x/y/z
// derivative contributions of plane q into r_V. The z contribution is only
// partial per call — it is completed as the caller iterates over all q.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}
737 
// Quadrature weights: each thread computes the tensor-product weight(s) for
// its quadrature point(s) from the 1D weights qweight1d; in 3D one weight
// per z-level is stored in w[z].
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}

template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
  {
    w[z] = pw*qweight1d[z];
  }
}

);
758 
759 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
760 
761   using std::ostringstream;
762   using std::string;
763   int ierr;
764   bool setupdone;
765   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
766   if (setupdone) return 0;
767   Ceed ceed;
768   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
769   CeedOperator_Cuda_gen *data;
770   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
771   CeedQFunction qf;
772   CeedQFunction_Cuda_gen *qf_data;
773   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
774   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
775   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
776           numoutputfields, ncomp, dim = 0, nnodes;
777   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
778   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
779   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
780   CeedChk(ierr);
781   CeedOperatorField *opinputfields, *opoutputfields;
782   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
783   CeedChk(ierr);
784   CeedQFunctionField *qfinputfields, *qfoutputfields;
785   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
786   CeedChk(ierr);
787   CeedEvalMode emode;
788   CeedInterlaceMode imode;
789   CeedBasis basis;
790   CeedBasis_Cuda_shared *basis_data;
791   CeedElemRestriction Erestrict;
792   CeedElemRestriction_Cuda_reg *restr_data;
793 
794   ostringstream code;
795   string devFunctions(deviceFunctions);
796 
797   // Add atomicAdd function for old NVidia architectures
798   struct cudaDeviceProp prop;
799   Ceed_Cuda *ceed_data;
800   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
801   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
802   if (prop.major<6){
803     code << atomicAdd;
804   }
805 
806   code << devFunctions;
807 
808   string qFunction(qf_data->qFunctionSource);
809 
810   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
811   code << "\n#define CeedPragmaSIMD\n";
812 
813   // Find dim and Q1d
814   bool collograd = false;
815   for (CeedInt i = 0; i < numinputfields; i++) {
816     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
817     if (basis != CEED_BASIS_COLLOCATED) {
818       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
819 
820       // Check for collocated gradient
821       if (basis_data->d_collograd1d)
822         collograd = true;
823 
824       // Collect dim and Q1d
825       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
826       bool isTensor;
827       ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
828       if (isTensor) {
829         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
830       } else {
831         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
832         }
833     }
834   }
835   // Check output bases for Q1d, dim as well
836   //   The only imput basis might be CEED_BASIS_COLLOCATED
837   for (CeedInt i = 0; i < numoutputfields; i++) {
838     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
839     if (basis != CEED_BASIS_COLLOCATED) {
840       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
841       // Collect dim and Q1d
842       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
843       bool isTensor;
844       ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
845       if (isTensor) {
846         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
847       } else {
848         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
849         }
850     }
851   }
852   data->dim = dim;
853   data->Q1d = Q1d;
854 
855   // Define CEED_Q_VLA
856   if (dim != 3 || collograd) {
857     code << "\n#define CEED_Q_VLA 1\n\n";
858   } else {
859     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
860   }
861 
862   code << qFunction;
863 
864   // Setup
865   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
866   // Input Evecs and Restriction
867   for (CeedInt i = 0; i < numinputfields; i++) {
868     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
869     CeedChk(ierr);
870     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
871       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
872     }
873   }
874 
875   for (CeedInt i = 0; i < numoutputfields; i++) {
876     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
877   }
878 
879   code << "const CeedInt Dim = "<<dim<<";\n";
880   code << "const CeedInt Q1d = "<<Q1d<<";\n";
881 
882   code << "extern __shared__ CeedScalar slice[];\n";
883   code << "BackendData data;\n";
884   code << "data.tidx = threadIdx.x;\n";
885   code << "data.tidy = threadIdx.y;\n";
886   code << "data.tidz = threadIdx.z;\n";
887   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
888   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
889 
890   code << "\n// Input field constants and basis data\n";
891   //Initialize constants, and matrices B and G
892   for (CeedInt i = 0; i < numinputfields; i++) {
893     code << "// -- Input field "<<i<<" --\n";
894     // Get elemsize, emode, ncomp
895     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
896     CeedChk(ierr);
897     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
898     CeedChk(ierr);
899     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
900     CeedChk(ierr);
901     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
902     CeedChk(ierr);
903 
904     // Set field constants
905     if (emode != CEED_EVAL_WEIGHT) {
906       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
907       if (basis != CEED_BASIS_COLLOCATED) {
908         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
909         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
910       } else {
911         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
912       }
913       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
914     }
915 
916     // Load basis data
917     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
918     switch (emode) {
919     case CEED_EVAL_NONE:
920       break;
921     case CEED_EVAL_INTERP:
922       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
923       data->B.in[i] = basis_data->d_interp1d;
924       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
925       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
926       break;
927     case CEED_EVAL_GRAD:
928       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
929       data->B.in[i] = basis_data->d_interp1d;
930       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932       if (basis_data->d_collograd1d) {
933         data->G.in[i] = basis_data->d_collograd1d;
934         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
935         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
936       } else {
937         data->G.in[i] = basis_data->d_grad1d;
938         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
939         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
940       }
941       break;
942     case CEED_EVAL_WEIGHT:
943       break; // No action
944     case CEED_EVAL_DIV:
945       break; // TODO: Not implemented
946     case CEED_EVAL_CURL:
947       break; // TODO: Not implemented
948     }
949   }
950 
951   code << "\n// Output field constants and basis data\n";
952   for (CeedInt i = 0; i < numoutputfields; i++) {
953     code << "// -- Output field "<<i<<" --\n";
954     // Get elemsize, emode, ncomp
955     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
956     CeedChk(ierr);
957     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
958     CeedChk(ierr);
959     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
960     CeedChk(ierr);
961     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
962     CeedChk(ierr);
963 
964     // Set field constants
965     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
966     if (basis != CEED_BASIS_COLLOCATED) {
967       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
968       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
969     } else {
970       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
971     }
972     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
973 
974     // Load basis data
975     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
976     switch (emode) {
977     case CEED_EVAL_NONE:
978       break; // No action
979     case CEED_EVAL_INTERP:
980       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
981       data->B.out[i] = basis_data->d_interp1d;
982       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
983       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
984       break;
985     case CEED_EVAL_GRAD:
986       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
987       data->B.out[i] = basis_data->d_interp1d;
988       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990       if (basis_data->d_collograd1d) {
991         data->G.out[i] = basis_data->d_collograd1d;
992         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
993         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
994       } else {
995         data->G.out[i] = basis_data->d_grad1d;
996         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
997         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
998       }
999       break;
1000     case CEED_EVAL_WEIGHT: {
1001       Ceed ceed;
1002       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1003       return CeedError(ceed, 1,
1004                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1005       break; // Should not occur
1006     }
1007     case CEED_EVAL_DIV:
1008       break; // TODO: Not implemented
1009     case CEED_EVAL_CURL:
1010       break; // TODO: Not implemented
1011     }
1012   }
1013   code << "\n";
1014   code << "__syncthreads();\n";
1015   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1016   // Input basis apply if needed
1017   // Generate the correct eval mode code for each input
1018   code << "\n// Input field restrictions and basis actions\n";
1019   for (CeedInt i = 0; i < numinputfields; i++) {
1020     code << "  // -- Input field "<<i<<" --\n";
1021     // Get elemsize, emode, ncomp
1022     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1023     CeedChk(ierr);
1024     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1025     CeedChk(ierr);
1026     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1027     CeedChk(ierr);
1028     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1029     CeedChk(ierr);
1030 
1031     // Restriction
1032     if (emode != CEED_EVAL_WEIGHT &&
1033         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1034       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1035       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1036       data->indices.in[i] = restr_data->d_ind;
1037       if (data->indices.in[i]) {
1038         ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1039         code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
1040         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1041         code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1042         code << "  readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1043       } else {
1044         bool backendstrides;
1045         ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1046                                                           &backendstrides);
1047         CeedChk(ierr);
1048         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1049         if (!backendstrides) {
1050           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1051           CeedChk(ierr);
1052         }
1053         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1054         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1055       }
1056     }
1057 
1058     // Basis action
1059     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1060     switch (emode) {
1061     case CEED_EVAL_NONE:
1062       if (!basis_data->d_collograd1d) {
1063         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1064       }
1065       break;
1066     case CEED_EVAL_INTERP:
1067       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1068       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1069       break;
1070     case CEED_EVAL_GRAD:
1071       if (basis_data->d_collograd1d) {
1072         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1073         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1074       } else {
1075         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1076         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1077       }
1078       break;
1079     case CEED_EVAL_WEIGHT:
1080       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1081       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1082       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1083       data->W = basis_data->d_qweight1d;
1084       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1085       break; // No action
1086     case CEED_EVAL_DIV:
1087       break; // TODO: Not implemented
1088     case CEED_EVAL_CURL:
1089       break; // TODO: Not implemented
1090     }
1091   }
1092 
1093   // Q function
1094   code << "\n// Output field setup\n";
1095   for (CeedInt i = 0; i < numoutputfields; i++) {
1096       code << "  // -- Output field "<<i<<" --\n";
1097     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1098     CeedChk(ierr);
1099     if (emode==CEED_EVAL_GRAD)
1100     {
1101       if (basis_data->d_collograd1d) {
1102         //Accumulator for gradient slices
1103         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1104         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1105         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1106         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1107         code << "    }\n";
1108         code << "  }\n";
1109       } else {
1110         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1111       }
1112     }
1113     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1114     {
1115       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1116     }
1117   }
1118   //We treat quadrature points per slice in 3d to save registers
1119   if (basis_data->d_collograd1d) {
1120     code << "\n  // Note: Collocated Gradient\n";
1121     code << "#pragma unroll\n";
1122     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1123     for (CeedInt i = 0; i < numinputfields; i++) {
1124       code << "  // -- Input field "<<i<<" --\n";
1125       // Get elemsize, emode, ncomp
1126       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1127       CeedChk(ierr);
1128       // Basis action
1129       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1130       switch (emode) {
1131       case CEED_EVAL_NONE:
1132         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1133         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1134         data->indices.in[i] = restr_data->d_ind;
1135         if (data->indices.in[i]) {
1136           ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1137           code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
1138           ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1139           code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1140           code << "  readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nnodes_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1141         } else {
1142           bool backendstrides;
1143           ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1144                                                             &backendstrides);
1145           CeedChk(ierr);
1146           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1147           if (!backendstrides) {
1148             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1149             CeedChk(ierr);
1150           }
1151           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1152           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1153         }
1154         break;
1155       case CEED_EVAL_INTERP:
1156         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1157         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1158         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1159         code << "  }\n";
1160         break;
1161       case CEED_EVAL_GRAD:
1162         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1163         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1164         break;
1165       case CEED_EVAL_WEIGHT:
1166         code << "  CeedScalar r_q"<<i<<"[1];\n";
1167         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1168         break; // No action
1169       case CEED_EVAL_DIV:
1170         break; // TODO: Not implemented
1171       case CEED_EVAL_CURL:
1172         break; // TODO: Not implemented
1173       }
1174     }
1175     for (CeedInt i = 0; i < numoutputfields; i++) {
1176       code << "  // -- Output field "<<i<<" --\n";
1177       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1178       CeedChk(ierr);
1179       // Basis action
1180       switch (emode) {
1181       case CEED_EVAL_NONE:
1182         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1183         break; // No action
1184       case CEED_EVAL_INTERP:
1185         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1186         break;
1187       case CEED_EVAL_GRAD:
1188         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1189         break;
1190       case CEED_EVAL_WEIGHT:
1191         break; // Should not occur
1192       case CEED_EVAL_DIV:
1193         break; // TODO: Not implemented
1194       case CEED_EVAL_CURL:
1195         break; // TODO: Not implemented
1196       }
1197     }
1198   } else {
1199       code << "\n  // Note: No Collocated Gradient\n";
1200     for (CeedInt i = 0; i < numinputfields; i++) {
1201       code << "  // -- Input field "<<i<<" --\n";
1202       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1203     }
1204     for (CeedInt i = 0; i < numoutputfields; i++) {
1205       code << "  // -- Output field "<<i<<" --\n";
1206       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1207     }
1208   }
1209   code << "  // QFunction Inputs and outputs\n";
1210   code << "  CeedScalar* in["<<numinputfields<<"];\n";
1211   for (CeedInt i = 0; i < numinputfields; i++) {
1212     code << "  // -- Input field "<<i<<" --\n";
1213     code << "  in["<<i<<"] = r_q"<<i<<";\n";
1214   }
1215   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
1216   for (CeedInt i = 0; i < numoutputfields; i++) {
1217     code << "  // -- Output field "<<i<<" --\n";
1218     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
1219   }
1220   code << "\n  // Apply QFunction\n";
1221   string qFunctionName(qf_data->qFunctionName);
1222   code << "  "<<qFunctionName<<"(ctx, ";
1223   if (dim != 3 || basis_data->d_collograd1d) {
1224     code << "1 ";
1225   } else {
1226     code << "Q1d ";
1227   }
1228   code << ", in, out);\n";
1229   if (basis_data->d_collograd1d) {
1230     code << "\n  // Note: Collocated Gradient\n";
1231     for (CeedInt i = 0; i < numoutputfields; i++) {
1232       code << "  // -- Output field "<<i<<" --\n";
1233       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1234       CeedChk(ierr);
1235       // Basis action
1236       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1237       switch (emode) {
1238       case CEED_EVAL_NONE:
1239         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1240         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1241         code << "  }\n";
1242         break; // No action
1243       case CEED_EVAL_INTERP:
1244         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1245         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1246         code << "  }\n";
1247         break;
1248       case CEED_EVAL_GRAD:
1249         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1250         break;
1251       case CEED_EVAL_WEIGHT:
1252         break; // Should not occur
1253       case CEED_EVAL_DIV:
1254         break; // TODO: Not implemented
1255       case CEED_EVAL_CURL:
1256         break; // TODO: Not implemented
1257       }
1258     }
1259     code << "}\n";
1260   }
1261 
1262   // Output basis apply if needed
1263   // Generate the correct eval mode code for each output
1264   code << "\n// Output field basis action and restrictions\n";
1265   for (CeedInt i = 0; i < numoutputfields; i++) {
1266     code << "  // -- Output field "<<i<<" --\n";
1267     // Get elemsize, emode, ncomp
1268     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1269     CeedChk(ierr);
1270     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1271     CeedChk(ierr);
1272     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1273     CeedChk(ierr);
1274     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1275     CeedChk(ierr);
1276     // Basis action
1277     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1278     switch (emode) {
1279     case CEED_EVAL_NONE:
1280       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1281       break; // No action
1282     case CEED_EVAL_INTERP:
1283       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1284       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1285       break;
1286     case CEED_EVAL_GRAD:
1287       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1288       if (basis_data->d_collograd1d) {
1289         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1290       } else {
1291         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1292       }
1293       break;
1294     case CEED_EVAL_WEIGHT: {
1295       Ceed ceed;
1296       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1297       return CeedError(ceed, 1,
1298                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1299       break; // Should not occur
1300     }
1301     case CEED_EVAL_DIV:
1302       break; // TODO: Not implemented
1303     case CEED_EVAL_CURL:
1304       break; // TODO: Not implemented
1305     }
1306     // Restriction
1307     ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1308     data->indices.out[i] = restr_data->d_ind;
1309     if (data->indices.out[i]) {
1310       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1311       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1312       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1313       code << "  // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n";
1314       code << "  writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1315     } else {
1316       bool backendstrides;
1317       ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict,
1318                                                         &backendstrides);
1319       CeedChk(ierr);
1320       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1321       if (!backendstrides) {
1322         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1323         CeedChk(ierr);
1324       }
1325       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1326       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1327     }
1328   }
1329 
1330   code << "  }\n";
1331   code << "}\n\n";
1332 
1333 //  std::cout << code.str();
1334 
1335   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1336   CeedChk(ierr);
1337   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1338   CeedChk(ierr);
1339 
1340   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1341 
1342   return 0;
1343 }
1344