xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 7d8d0e25636a94a27ff75b3dec09737e24cdb0fe)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-cuda-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../cuda-shared/ceed-cuda-shared.h"
22 
// CAS-based double-precision atomicAdd fallback as a JIT source string.
// The builtin double atomicAdd requires SM60+ -- presumably this string is
// only compiled into the generated kernel for older targets; confirm at the
// kernel-build call site.
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  // Reinterpret the destination as a 64-bit integer so atomicCAS can be used.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Try to swap in (assumed + val); on failure, 'old' receives the value
    // another thread stored and the loop retries from that.
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  // Return the value previously at *address, matching the builtin contract.
  return __longlong_as_double(old);
}
);
42 
43 static const char *deviceFunctions = QUOTE(
44 
45 //------------------------------------------------------------------------------
46 // Typedefs
47 //------------------------------------------------------------------------------
// Fixed-capacity (16-entry) pointer tables for an operator's input/output
// field data and element-restriction indices, passed to the generated kernel.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread bookkeeping handed to every device helper below.
typedef struct {
  CeedInt tidx;   // thread index in x
  CeedInt tidy;   // thread index in y
  CeedInt tidz;   // thread index in z
  CeedInt tid;    // flattened thread index within the block
  CeedScalar* slice;  // contraction scratch; the __syncthreads() pairing in
                      // the Contract* helpers implies it must be visible
                      // block-wide (shared memory) -- see ContractX2d
} BackendData;
58 
59 //------------------------------------------------------------------------------
60 // Load matrices for basis actions
61 //------------------------------------------------------------------------------
// Cooperatively copy a P*Q basis matrix from global (d_B) into B: the
// block's threads stride through the entries starting at their flat id.
// No barrier here -- the caller is responsible for synchronizing before use.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt nthreads = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += nthreads) {
    B[k] = d_B[k];
  }
}
67 
68 //------------------------------------------------------------------------------
69 // 1D
70 //------------------------------------------------------------------------------
71 
72 //------------------------------------------------------------------------------
73 // L-vector -> E-vector, offsets provided
74 //------------------------------------------------------------------------------
// Read one element's dofs from the L-vector into registers using restriction
// offsets.  Each of the first P1d threads in x owns one node; r_u receives
// one value per component.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + elem*P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[dof + c*COMPSTRIDE];
  }
}
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, strided
87 //------------------------------------------------------------------------------
// Read one element's dofs from a strided (index-free) L-vector into
// registers.  Each of the first P1d threads in x owns one node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d)
    return;
  const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[base + c*STRIDES_COMP];
  }
}
97 
98 //------------------------------------------------------------------------------
99 // E-vector -> L-vector, offsets provided
100 //------------------------------------------------------------------------------
// Accumulate one element's register values back into the L-vector via
// restriction offsets.  atomicAdd is required because multiple elements may
// share a dof.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + elem*P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[dof + c*COMPSTRIDE], r_v[c]);
  }
}
110 
111 //------------------------------------------------------------------------------
112 // E-vector -> L-vector, strided
113 //------------------------------------------------------------------------------
// Accumulate one element's register values back into a strided L-vector.
// Strided layouts give each (node, comp, elem) a unique slot, so a plain
// += suffices (no atomics).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d)
    return;
  const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[base + c*STRIDES_COMP] += r_v[c];
  }
}
123 
124 //------------------------------------------------------------------------------
125 // 1D tensor contraction x
126 //------------------------------------------------------------------------------
// 1D tensor contraction: V = sum_i B[i + tidx*P1d] * U(i), for one
// register value per thread.  U is staged through data.slice (block-visible
// scratch) so each thread can read the whole line.  Both __syncthreads()
// are outside the divergent guard, so every thread reaches them: the first
// orders writes before reads, the second protects slice from being
// overwritten by a following contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
137 
138 //------------------------------------------------------------------------------
139 // 1D transpose tensor contraction x
140 //------------------------------------------------------------------------------
// Transpose of ContractX1d: V = sum_i B[tidx + i*P1d] * U(i), mapping Q1d
// quadrature values back to P1d nodes.  Same staging/barrier pattern as
// ContractX1d; both barriers are reached by all threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
151 
152 //------------------------------------------------------------------------------
153 // 1D interpolate to quadrature points
154 //------------------------------------------------------------------------------
// Interpolate nodal values to quadrature points, one component at a time,
// via a single x contraction with the interpolation matrix c_B.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
  }
}
160 
161 //------------------------------------------------------------------------------
162 // 1D interpolate transpose
163 //------------------------------------------------------------------------------
// Transpose interpolation: map quadrature-point values back to nodes,
// component by component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
  }
}
169 
170 //------------------------------------------------------------------------------
171 // 1D derivatives at quadrature points
172 //------------------------------------------------------------------------------
// Evaluate x derivatives at quadrature points using the gradient matrix
// c_G.  c_B is unused in 1D; it is kept for interface parity with the 2D/3D
// grad helpers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
  }
}
178 
179 //------------------------------------------------------------------------------
180 // 1D derivatives transpose
181 //------------------------------------------------------------------------------
// Transpose of grad1d: accumulate derivative contributions back to nodes.
// c_B is unused in 1D; kept for interface parity with 2D/3D.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
  }
}
187 
188 //------------------------------------------------------------------------------
189 // 2D
190 //------------------------------------------------------------------------------
191 
192 //------------------------------------------------------------------------------
193 // L-vector -> E-vector, offsets provided
194 //------------------------------------------------------------------------------
// Read one 2D element's dofs from the L-vector into registers using
// restriction offsets.  Threads with (tidx, tidy) < P1d each own one node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt n = data.tidx + data.tidy*P1d;
  const CeedInt dof = indices[n + elem*P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[dof + c*COMPSTRIDE];
  }
}
204 
205 //------------------------------------------------------------------------------
206 // L-vector -> E-vector, strided
207 //------------------------------------------------------------------------------
// Read one 2D element's dofs from a strided (index-free) L-vector.
// Threads with (tidx, tidy) < P1d each own one node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt n = data.tidx + data.tidy*P1d;
  const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[base + c*STRIDES_COMP];
  }
}
217 
218 //------------------------------------------------------------------------------
219 // E-vector -> L-vector, offsets provided
220 //------------------------------------------------------------------------------
// Accumulate one 2D element's register values into the L-vector via
// restriction offsets; atomicAdd handles dofs shared between elements.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt n = data.tidx + data.tidy*P1d;
  const CeedInt dof = indices[n + elem*P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[dof + c*COMPSTRIDE], r_v[c]);
  }
}
230 
231 //------------------------------------------------------------------------------
232 // E-vector -> L-vector, strided
233 //------------------------------------------------------------------------------
// Accumulate one 2D element's register values into a strided L-vector.
// Strided layouts have no dof sharing, so a plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt n = data.tidx + data.tidy*P1d;
  const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[base + c*STRIDES_COMP] += r_v[c];
  }
}
243 
244 //------------------------------------------------------------------------------
245 // 2D tensor contraction x
246 //------------------------------------------------------------------------------
// 2D x contraction: V(tidx, tidy) = sum_i B[i + tidx*P1d] * U(i, tidy).
// The per-thread value U is staged through data.slice (block-visible
// scratch, leading dimension T1d) so each thread can read its whole x row.
// Both __syncthreads() are outside the divergent guard and thus reached by
// every thread: the first orders the writes before reads, the second keeps
// slice from being clobbered by a following contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
257 
258 //------------------------------------------------------------------------------
259 // 2D tensor contract y
260 //------------------------------------------------------------------------------
// 2D y contraction: V(tidx, tidy) = sum_i B[i + tidy*P1d] * U(tidx, i).
// Same slice-staging and barrier discipline as ContractX2d; both barriers
// are unconditional.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
271 
272 //------------------------------------------------------------------------------
273 // 2D transpose tensor contract y
274 //------------------------------------------------------------------------------
// Transpose 2D y contraction: V(tidx, tidy) = sum_i B[tidy + i*P1d] *
// U(tidx, i), mapping Q1d quadrature values back to P1d nodes in y.
// Same slice-staging and unconditional-barrier pattern as ContractY2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
285 
286 //------------------------------------------------------------------------------
287 // 2D transpose tensor contract x
288 //------------------------------------------------------------------------------
// Transpose 2D x contraction: V(tidx, tidy) = sum_i B[tidx + i*P1d] *
// U(i, tidy).  V is overwritten (zeroed first); see the *Add* variant for
// the accumulating form.  Barriers are unconditional.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
299 
300 //------------------------------------------------------------------------------
301 // 2D transpose tensor contract and add x
302 //------------------------------------------------------------------------------
// Accumulating variant of ContractXTranspose2d: identical contraction, but
// V is NOT zeroed -- the result is added into the existing value (used to
// sum d/dx and d/dy contributions in gradTranspose2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
312 
313 //------------------------------------------------------------------------------
314 // 2D interpolate to quadrature points
315 //------------------------------------------------------------------------------
// 2D interpolation to quadrature points: contract x then y with c_B for
// each component, passing the intermediate through a one-entry register.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
324 
325 //------------------------------------------------------------------------------
326 // 2D interpolate transpose
327 //------------------------------------------------------------------------------
// Transpose 2D interpolation: contract y then x with c_B (reverse order of
// interp2d) for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
336 
337 //------------------------------------------------------------------------------
338 // 2D derivatives at quadrature points
339 //------------------------------------------------------------------------------
// 2D gradient at quadrature points.  For each component, the x derivative
// (G in x, B in y) lands in r_V[c], and the y derivative (B in x, G in y)
// in r_V[c + NCOMP].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    // d/dx: differentiate in x, interpolate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c + 0*NCOMP]);
    // d/dy: interpolate in x, differentiate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_G, &r_V[c + 1*NCOMP]);
  }
}
350 
351 //------------------------------------------------------------------------------
352 // 2D derivatives transpose
353 //------------------------------------------------------------------------------
// Transpose 2D gradient: pull the x-derivative slab (r_U[c]) and the
// y-derivative slab (r_U[c + NCOMP]) back to nodes, summing both into
// r_V[c] (the second pass uses the accumulating x contraction).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + 0*NCOMP], c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_G, &r_V[c]);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + 1*NCOMP], c_G, scratch);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
364 
365 //------------------------------------------------------------------------------
366 // 3D
367 //------------------------------------------------------------------------------
368 
369 //------------------------------------------------------------------------------
370 // L-vector -> E-vector, offsets provided
371 //------------------------------------------------------------------------------
// Read one 3D element's dofs from the L-vector into registers using
// restriction offsets.  Each (tidx, tidy) thread owns a full z-line of P1d
// nodes; r_u is laid out as r_u[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt k = 0; k < P1d; k++) {
    const CeedInt n = data.tidx + P1d*(data.tidy + P1d*k);
    const CeedInt dof = indices[n + elem*P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[k + c*P1d] = d_u[dof + c*COMPSTRIDE];
  }
}
382 
383 //------------------------------------------------------------------------------
384 // L-vector -> E-vector, strided
385 //------------------------------------------------------------------------------
// Read one 3D element's dofs from a strided (index-free) L-vector.  Each
// (tidx, tidy) thread owns a full z-line; r_u is r_u[z + comp*P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt k = 0; k < P1d; k++) {
    const CeedInt n = data.tidx + P1d*(data.tidy + P1d*k);
    const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[k + c*P1d] = d_u[base + c*STRIDES_COMP];
  }
}
396 
397 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
399 //------------------------------------------------------------------------------
// Read one z-slice (level q) of a 3D element's quadrature values from the
// E-vector into registers using offsets.  Threads with (tidx, tidy) < Q1d
// each own one quadrature point; r_u receives NCOMP values.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    // Fixed: removed stray empty statement (double semicolon).
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
409 
410 //------------------------------------------------------------------------------
411 // E-vector -> Q-vector, strided
412 //------------------------------------------------------------------------------
// Read one z-slice (level q) of a 3D element's quadrature values from a
// strided E-vector.  Threads with (tidx, tidy) < Q1d each own one point.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= Q1d || data.tidy >= Q1d)
    return;
  const CeedInt n = data.tidx + Q1d*(data.tidy + Q1d*q);
  const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[base + c*STRIDES_COMP];
  }
}
422 
423 //------------------------------------------------------------------------------
424 // E-vector -> L-vector, offsets provided
425 //------------------------------------------------------------------------------
// Accumulate one 3D element's register values into the L-vector via
// restriction offsets; atomicAdd handles dofs shared between elements.
// r_v is laid out as r_v[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt k = 0; k < P1d; k++) {
    const CeedInt n = data.tidx + P1d*(data.tidy + P1d*k);
    const CeedInt dof = indices[n + elem*P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + c*COMPSTRIDE], r_v[k + c*P1d]);
  }
}
436 
437 //------------------------------------------------------------------------------
438 // E-vector -> L-vector, strided
439 //------------------------------------------------------------------------------
// Accumulate one 3D element's register values into a strided L-vector.
// Strided layouts have no dof sharing, so plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt k = 0; k < P1d; k++) {
    const CeedInt n = data.tidx + P1d*(data.tidy + P1d*k);
    const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c*STRIDES_COMP] += r_v[k + c*P1d];
  }
}
450 
451 //------------------------------------------------------------------------------
452 // 3D tensor contract x
453 //------------------------------------------------------------------------------
// 3D x contraction, one z-level at a time:
//   V[k](tidx, tidy) = sum_i B[i + tidx*P1d] * U[k](i, tidy)
// The thread's row of B is cached in registers (r_B) once, then each level
// of U is staged through data.slice.  The two barriers inside the k loop
// are outside the divergent guard, so every thread reaches them each
// iteration; the second also protects slice before the next level's write.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
470 
471 //------------------------------------------------------------------------------
472 // 3D tensor contract y
473 //------------------------------------------------------------------------------
// 3D y contraction, one z-level at a time:
//   V[k](tidx, tidy) = sum_i B[i + tidy*P1d] * U[k](tidx, i)
// Same register-cached B row, slice staging, and unconditional per-level
// barriers as ContractX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
490 
491 //------------------------------------------------------------------------------
492 // 3D tensor contract z
493 //------------------------------------------------------------------------------
// 3D z contraction: V[q] = sum_p B[p + q*P1d] * U[p].  Each thread owns
// its full z-line in registers, so no shared-memory staging or barriers
// are needed.  V[q] is zeroed for every thread; only in-range threads
// accumulate.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt q = 0; q < Q1d; q++) {
    V[q] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d) {
      for (CeedInt p = 0; p < P1d; p++)
        V[q] += B[p + q*P1d] * U[p]; // Contract z direction
    }
  }
}
503 
504 //------------------------------------------------------------------------------
505 // 3D transpose tensor contract z
506 //------------------------------------------------------------------------------
// Transpose 3D z contraction: V[p] = sum_q B[p + q*P1d] * U[q].  Register
// only -- no shared memory or barriers.  V[p] is zeroed for every thread;
// only in-range threads accumulate.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt p = 0; p < P1d; p++) {
    V[p] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d) {
      for (CeedInt q = 0; q < Q1d; q++)
        V[p] += B[p + q*P1d] * U[q]; // Contract z direction
    }
  }
}
516 
517 //------------------------------------------------------------------------------
518 // 3D transpose tensor contract y
519 //------------------------------------------------------------------------------
// Transpose 3D y contraction, one z-level at a time:
//   V[k](tidx, tidy) = sum_i B[tidy + i*P1d] * U[k](tidx, i)
// The thread's column of B is cached in registers; each level of U is
// staged through data.slice with unconditional barriers (as in ContractX3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
536 
537 //------------------------------------------------------------------------------
538 // 3D transpose tensor contract add y
539 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeY3d: identical contraction, but
// V[k] is NOT zeroed -- results are added into the existing values (used
// to sum derivative contributions in gradTranspose3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
555 
556 //------------------------------------------------------------------------------
557 // 3D transpose tensor contract x
558 //------------------------------------------------------------------------------
// Transpose 3D x contraction, one z-level at a time:
//   V[k](tidx, tidy) = sum_i B[tidx + i*P1d] * U[k](i, tidy)
// B column cached in registers; slice staging with unconditional barriers,
// as in the other 3D contractions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
575 
576 //------------------------------------------------------------------------------
577 // 3D transpose tensor contract add x
578 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d: identical contraction, but
// V[k] is NOT zeroed -- results are added into the existing values (used
// to sum derivative contributions in gradTranspose3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
594 
595 //------------------------------------------------------------------------------
596 // 3D interpolate to quadrature points
597 //------------------------------------------------------------------------------
// 3D interpolation to quadrature points: contract x, y, then z with c_B
// for each component.  r_U is r_U[z + comp*P1d]; r_V is r_V[z + comp*Q1d].
// tmp buffers are sized T1d (compile-time max line length -- presumably
// max(P1d, Q1d); defined by the code generator).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];
  CeedScalar tmp2[T1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d]);
  }
}
608 
609 //------------------------------------------------------------------------------
610 // 3D interpolate transpose
611 //------------------------------------------------------------------------------
// Transpose 3D interpolation: contract z, y, then x with c_B (reverse
// order of interp3d) for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];
  CeedScalar tmp2[T1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, tmp1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*P1d]);
  }
}
622 
623 //------------------------------------------------------------------------------
624 // 3D derivatives at quadrature points
625 //------------------------------------------------------------------------------
// 3D gradient at quadrature points.  For each component, three passes
// place the gradient matrix c_G in exactly one direction:
//   d/dx -> r_V[c*Q1d], d/dy -> r_V[c*Q1d + NCOMP*Q1d],
//   d/dz -> r_V[c*Q1d + 2*NCOMP*Q1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];
  CeedScalar tmp2[T1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    // d/dx: G, B, B
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_G, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d + 0*NCOMP*Q1d]);
    // d/dy: B, G, B
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_G, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d + 1*NCOMP*Q1d]);
    // d/dz: B, B, G
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_G, &r_V[c*Q1d + 2*NCOMP*Q1d]);
  }
}
642 
643 //------------------------------------------------------------------------------
644 // 3D derivatives transpose
645 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient: pull the x, y, and z partials (three NCOMP*Q1d chunks
  // of r_U) back to nodal values. Each chunk is contracted in reverse order
  // (z, y, x) with G^T in its own direction and B^T in the others. The first
  // chunk writes r_V; the later two accumulate via the TransposeAdd variant,
  // summing the three directional contributions.
  CeedScalar r_t1[T1d]; // scratch for the first contraction result
  CeedScalar r_t2[T1d]; // scratch for the second contraction result
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-partial chunk: G^T in x (write)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-partial chunk: G^T in y (accumulate)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-partial chunk: G^T in z (accumulate)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
662 
663 //------------------------------------------------------------------------------
664 // 3D collocated derivatives computation
665 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient on z-slice q: the x and y derivatives exchange the
  // slice through shared memory (data.slice), while the z derivative is a
  // register-only contraction along each thread's z column.
  //
  // Fix: __syncthreads() must be reached by every thread in the block. The
  // previous code placed the barriers inside the (tidx, tidy) bounds guard,
  // which is undefined behavior whenever the block is padded beyond Q1d
  // threads (T1d > Q1d). Barriers are now hoisted out of the divergent branch;
  // only the shared-memory writes and the per-thread work stay guarded.
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads(); // slice fully written before any thread reads it
    if (quad) {
      // X derivative: contract x direction against the shared slice
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d];
      // Y derivative: contract y direction against the shared slice
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d];
      // Z derivative: contract z direction in registers (no exchange needed)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];
    }
    __syncthreads(); // all reads done before the next component overwrites slice
  }
}
688 
689 //------------------------------------------------------------------------------
690 // 3D collocated derivatives transpose
691 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose collocated gradient on z-slice q: accumulate the transposed x,
  // y, and z directional contributions into r_V (caller zero-initializes the
  // accumulator). The x and y parts exchange through shared memory; the z
  // part is a register-only PARTIAL contraction spread over the q loop.
  //
  // Fix: __syncthreads() must be reached by every thread in the block. The
  // previous code placed the barriers inside the (tidx, tidy) bounds guard,
  // which is undefined behavior whenever the block is padded beyond Q1d
  // threads (T1d > Q1d). Barriers are now hoisted out of the divergent branch.
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative: transpose contract x direction via the shared slice
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d];
    __syncthreads();
    // Y derivative: transpose contract y direction via the shared slice
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d];
    __syncthreads(); // reads complete before the next component reuses slice
    // Z derivative: PARTIAL contract z direction, registers only
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];
  }
}
714 
715 //------------------------------------------------------------------------------
716 // 1D quadrature weights
717 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Load this thread's 1D quadrature weight; threads past Q1d get zero
  if (data.tidx < Q1d)
    *w = qweight1d[data.tidx];
  else
    *w = 0.0;
}
722 
723 //------------------------------------------------------------------------------
724 // 2D quadrature weights
725 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 2D tensor-product weight: product of the 1D weights for this thread's
  // (x, y) quadrature point; threads outside the Q1d x Q1d grid get zero
  const bool inBounds = data.tidx < Q1d && data.tidy < Q1d;
  *w = inBounds ? qweight1d[data.tidx] * qweight1d[data.tidy] : 0.0;
}
731 
732 //------------------------------------------------------------------------------
733 // 3D quadrature weights
734 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 3D tensor-product weights: each in-bounds thread owns one (x, y) column
  // and fills all Q1d weights along z; out-of-bounds threads get zeros
  const bool inBounds = data.tidx < Q1d && data.tidy < Q1d;
  const CeedScalar wxy = inBounds ? qweight1d[data.tidx] * qweight1d[data.tidy] : 0.0;
  for (CeedInt k = 0; k < Q1d; ++k)
    w[k] = inBounds ? wxy * qweight1d[k] : 0.0;
}
742 
743 );
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
747 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
748 
749   using std::ostringstream;
750   using std::string;
751   int ierr;
752   bool setupdone;
753   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
754   if (setupdone) return 0;
755   Ceed ceed;
756   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
757   CeedOperator_Cuda_gen *data;
758   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
759   CeedQFunction qf;
760   CeedQFunction_Cuda_gen *qf_data;
761   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
762   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
763   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
764           numoutputfields, ncomp, dim = 0, lsize;
765   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
766   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
767   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
768   CeedChk(ierr);
769   CeedOperatorField *opinputfields, *opoutputfields;
770   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
771   CeedChk(ierr);
772   CeedQFunctionField *qfinputfields, *qfoutputfields;
773   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
774   CeedChk(ierr);
775   CeedEvalMode emode;
776   CeedBasis basis;
777   CeedBasis_Cuda_shared *basis_data;
778   CeedElemRestriction Erestrict;
779   CeedElemRestriction_Cuda *restr_data;
780 
781   ostringstream code;
782   string devFunctions(deviceFunctions);
783 
784   // Add atomicAdd function for old NVidia architectures
785   struct cudaDeviceProp prop;
786   Ceed_Cuda *ceed_data;
787   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
788   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
789   if (prop.major<6){
790     code << atomicAdd;
791   }
792 
793   code << devFunctions;
794 
795   string qFunction(qf_data->qFunctionSource);
796   string qFunctionName(qf_data->qFunctionName);
797   string oper;
798   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
799 
800   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
801   code << "\n#define CeedPragmaSIMD\n";
802 
803   // Find dim and Q1d
804   bool useCollograd = true;
805   data->maxP1d = 0;
806   for (CeedInt i = 0; i < numinputfields; i++) {
807     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
808     if (basis != CEED_BASIS_COLLOCATED) {
809       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
810       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
811       CeedChk(ierr);
812 
813       // Check for collocated gradient
814       useCollograd = useCollograd && basis_data->d_collograd1d;
815 
816       // Collect dim and Q1d
817       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
818       bool isTensor;
819       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
820       if (isTensor) {
821         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
822         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
823         if (P1d>data->maxP1d) data->maxP1d = P1d;
824       } else {
825         // LCOV_EXCL_START
826         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
827         // LCOV_EXCL_STOP
828         }
829     }
830   }
  // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
833   for (CeedInt i = 0; i < numoutputfields; i++) {
834     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
835 
836     if (basis != CEED_BASIS_COLLOCATED) {
837       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
838       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
839       CeedChk(ierr);
840 
841       // Collect dim and Q1d
842       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
843       bool isTensor;
844       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
845       if (isTensor) {
846         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
847       } else {
848         // LCOV_EXCL_START
849         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
850         // LCOV_EXCL_STOP
851         }
852 
853       // Check for collocated gradient
854       useCollograd = useCollograd && basis_data->d_collograd1d;
855     }
856   }
857   data->dim = dim;
858   data->Q1d = Q1d;
859 
860   // Define CEED_Q_VLA
861   if (dim != 3 || useCollograd) {
862     code << "\n#define CEED_Q_VLA 1\n\n";
863   } else {
864     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
865   }
866 
867   code << qFunction;
868 
869   // Setup
870   code << "\n// -----------------------------------------------------------------------------\n";
871   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
872   for (CeedInt i = 0; i < numinputfields; i++) {
873     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
874     CeedChk(ierr);
875     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
876       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
877     }
878   }
879 
880   for (CeedInt i = 0; i < numoutputfields; i++) {
881     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
882   }
883 
884   code << "  const CeedInt Dim = "<<dim<<";\n";
885   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
886 
887   code << "  extern __shared__ CeedScalar slice[];\n";
888   code << "  BackendData data;\n";
889   code << "  data.tidx = threadIdx.x;\n";
890   code << "  data.tidy = threadIdx.y;\n";
891   code << "  data.tidz = threadIdx.z;\n";
892   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
893   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
894 
895   code << "\n  // -- Input field constants and basis data --\n";
896   //Initialize constants, and matrices B and G
897   for (CeedInt i = 0; i < numinputfields; i++) {
898     code << "  // ---- Input field "<<i<<" ----\n";
899     // Get elemsize, emode, ncomp
900     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
901     CeedChk(ierr);
902     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
903     CeedChk(ierr);
904     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
905     CeedChk(ierr);
906     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
907     CeedChk(ierr);
908 
909     // Set field constants
910     if (emode != CEED_EVAL_WEIGHT) {
911       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
912       if (basis != CEED_BASIS_COLLOCATED) {
913         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
914         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
915       } else {
916         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
917       }
918       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
919     }
920 
921     // Load basis data
922     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
923     switch (emode) {
924     case CEED_EVAL_NONE:
925       break;
926     case CEED_EVAL_INTERP:
927       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
928       data->B.in[i] = basis_data->d_interp1d;
929       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
930       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
931       break;
932     case CEED_EVAL_GRAD:
933       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
934       data->B.in[i] = basis_data->d_interp1d;
935       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
936       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
937       if (useCollograd) {
938         data->G.in[i] = basis_data->d_collograd1d;
939         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
940         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
941       } else {
942         data->G.in[i] = basis_data->d_grad1d;
943         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
944         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
945       }
946       break;
947     case CEED_EVAL_WEIGHT:
948       break; // No action
949     case CEED_EVAL_DIV:
950       break; // TODO: Not implemented
951     case CEED_EVAL_CURL:
952       break; // TODO: Not implemented
953     }
954   }
955 
956   code << "\n  // -- Output field constants and basis data --\n";
957   for (CeedInt i = 0; i < numoutputfields; i++) {
958     code << "  // ---- Output field "<<i<<" ----\n";
959     // Get elemsize, emode, ncomp
960     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
961     CeedChk(ierr);
962     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
963     CeedChk(ierr);
964     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
965     CeedChk(ierr);
966     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
967     CeedChk(ierr);
968 
969     // Set field constants
970     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
971     if (basis != CEED_BASIS_COLLOCATED) {
972       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
973       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
974     } else {
975       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
976     }
977     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
978 
979     // Load basis data
980     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
981     switch (emode) {
982     case CEED_EVAL_NONE:
983       break; // No action
984     case CEED_EVAL_INTERP:
985       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
986       data->B.out[i] = basis_data->d_interp1d;
987       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
988       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
989       break;
990     case CEED_EVAL_GRAD:
991       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
992       data->B.out[i] = basis_data->d_interp1d;
993       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
994       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
995       if (useCollograd) {
996         data->G.out[i] = basis_data->d_collograd1d;
997         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
998         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
999       } else {
1000         data->G.out[i] = basis_data->d_grad1d;
1001         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1002         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1003       }
1004       break;
1005     // LCOV_EXCL_START
1006     case CEED_EVAL_WEIGHT: {
1007       Ceed ceed;
1008       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1009       return CeedError(ceed, 1,
1010                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1011       break; // Should not occur
1012     }
1013     case CEED_EVAL_DIV:
1014       break; // TODO: Not implemented
1015     case CEED_EVAL_CURL:
1016       break; // TODO: Not implemented
1017       // LCOV_EXCL_STOP
1018     }
1019   }
1020   code << "\n  // -- Element loop --\n";
1021   code << "  __syncthreads();\n";
1022   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1023   // Input basis apply if needed
1024   // Generate the correct eval mode code for each input
1025   code << "    // -- Input field restrictions and basis actions --\n";
1026   for (CeedInt i = 0; i < numinputfields; i++) {
1027     code << "    // ---- Input field "<<i<<" ----\n";
1028     // Get elemsize, emode, ncomp
1029     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1030     CeedChk(ierr);
1031     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1032     CeedChk(ierr);
1033     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1034     CeedChk(ierr);
1035     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1036     CeedChk(ierr);
1037 
1038     // Restriction
1039     if (emode != CEED_EVAL_WEIGHT &&
1040         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1041       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1042 
1043       bool isStrided;
1044       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1045       if (!isStrided) {
1046         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1047         CeedChk(ierr);
1048         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1049         CeedInt compstride;
1050         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1051         code << "    // CompStride: "<<compstride<<"\n";
1052         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1053         data->indices.in[i] = restr_data->d_ind;
1054         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1055       } else {
1056         bool backendstrides;
1057         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1058         CeedChk(ierr);
1059         CeedInt nelem;
1060         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1061         CeedChk(ierr);
1062         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1063         if (!backendstrides) {
1064           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1065           CeedChk(ierr);
1066         }
1067         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1068         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1069       }
1070     }
1071 
1072     // Basis action
1073     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1074     switch (emode) {
1075     case CEED_EVAL_NONE:
1076       if (!useCollograd) {
1077         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1078       }
1079       break;
1080     case CEED_EVAL_INTERP:
1081       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1082       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1083       break;
1084     case CEED_EVAL_GRAD:
1085       if (useCollograd) {
1086         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1087         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1088       } else {
1089         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1090         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1091       }
1092       break;
1093     case CEED_EVAL_WEIGHT:
1094       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1095       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1096       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1097       data->W = basis_data->d_qweight1d;
1098       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1099       break; // No action
1100     case CEED_EVAL_DIV:
1101       break; // TODO: Not implemented
1102     case CEED_EVAL_CURL:
1103       break; // TODO: Not implemented
1104     }
1105   }
1106 
1107   // Q function
1108   code << "\n    // -- Output field setup --\n";
1109   for (CeedInt i = 0; i < numoutputfields; i++) {
1110       code << "\n    // ---- Output field "<<i<<" ----\n";
1111     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1112     CeedChk(ierr);
1113     if (emode==CEED_EVAL_GRAD)
1114     {
1115       if (useCollograd) {
1116         //Accumulator for gradient slices
1117         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1118         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1119         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1120         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1121         code << "      }\n";
1122         code << "    }\n";
1123       } else {
1124         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1125       }
1126     }
1127     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1128     {
1129       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1130     }
1131   }
1132   // We treat quadrature points per slice in 3d to save registers
1133   if (useCollograd) {
1134     code << "\n    // Note: Collocated Gradient\n";
1135     code << "#pragma unroll\n";
1136     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1137     code << "      // -- Input fields --\n";
1138     for (CeedInt i = 0; i < numinputfields; i++) {
1139       code << "      // ---- Input field "<<i<<" ----\n";
1140       // Get elemsize, emode, ncomp
1141       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1142       CeedChk(ierr);
1143       // Basis action
1144       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1145       switch (emode) {
1146       case CEED_EVAL_NONE:
1147         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1148 
1149         bool isStrided;
1150         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1151         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1152         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1153         if (!isStrided) {
1154           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1155           CeedChk(ierr);
1156           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1157           CeedInt compstride;
1158           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1159           code << "      // CompStride: "<<compstride<<"\n";
1160           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1161           data->indices.in[i] = restr_data->d_ind;
1162           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1163         } else {
1164           bool backendstrides;
1165           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1166           CeedChk(ierr);
1167           CeedInt nelem;
1168           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1169           CeedChk(ierr);
1170           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1171           if (!backendstrides) {
1172             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1173             CeedChk(ierr);
1174           }
1175           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1176           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1177         }
1178         break;
1179       case CEED_EVAL_INTERP:
1180         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1181         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1182         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1183         code << "      }\n";
1184         break;
1185       case CEED_EVAL_GRAD:
1186         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1187         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1188         break;
1189       case CEED_EVAL_WEIGHT:
1190         code << "      CeedScalar r_q"<<i<<"[1];\n";
1191         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1192         break; // No action
1193       case CEED_EVAL_DIV:
1194         break; // TODO: Not implemented
1195       case CEED_EVAL_CURL:
1196         break; // TODO: Not implemented
1197       }
1198     }
1199     code << "\n      // -- Output fields --\n";
1200     for (CeedInt i = 0; i < numoutputfields; i++) {
1201       code << "      // ---- Output field "<<i<<" ----\n";
1202       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1203       CeedChk(ierr);
1204       // Basis action
1205       switch (emode) {
1206       case CEED_EVAL_NONE:
1207         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1208         break; // No action
1209       case CEED_EVAL_INTERP:
1210         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1211         break;
1212       case CEED_EVAL_GRAD:
1213         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1214         break;
1215       case CEED_EVAL_WEIGHT:
1216         break; // Should not occur
1217       case CEED_EVAL_DIV:
1218         break; // TODO: Not implemented
1219       case CEED_EVAL_CURL:
1220         break; // TODO: Not implemented
1221       }
1222     }
1223   } else {
1224     code << "\n      // Note: No Collocated Gradient\n";
1225     code << "      // -- Input fields --\n";
1226     for (CeedInt i = 0; i < numinputfields; i++) {
1227       code << "      // ---- Input field "<<i<<" ----\n";
1228       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1229     }
1230     code << "      // -- Output fields --\n";
1231     for (CeedInt i = 0; i < numoutputfields; i++) {
1232       code << "      // ---- Output field "<<i<<" ----\n";
1233       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1234     }
1235   }
1236   code << "\n      // -- QFunction Inputs and outputs --\n";
1237   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1238   for (CeedInt i = 0; i < numinputfields; i++) {
1239     code << "      // ---- Input field "<<i<<" ----\n";
1240     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1241   }
1242   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1243   for (CeedInt i = 0; i < numoutputfields; i++) {
1244     code << "      // ---- Output field "<<i<<" ----\n";
1245     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1246   }
1247   code << "\n      // -- Apply QFunction --\n";
1248   code << "      "<<qFunctionName<<"(ctx, ";
1249   if (dim != 3 || useCollograd) {
1250     code << "1";
1251   } else {
1252     code << "Q1d";
1253   }
1254   code << ", in, out);\n";
1255   if (useCollograd) {
1256     code << "\n      // Note: Collocated Gradient\n";
1257     code << "      // -- Output fields --\n";
1258     for (CeedInt i = 0; i < numoutputfields; i++) {
1259       code << "      // ---- Output field "<<i<<" ----\n";
1260       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1261       CeedChk(ierr);
1262       // Basis action
1263       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1264       switch (emode) {
1265       case CEED_EVAL_NONE:
1266         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1267         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1268         code << "      }\n";
1269         break; // No action
1270       case CEED_EVAL_INTERP:
1271         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1272         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1273         code << "      }\n";
1274         break;
1275       case CEED_EVAL_GRAD:
1276         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1277         break;
1278       case CEED_EVAL_WEIGHT:
1279         break; // Should not occur
1280       case CEED_EVAL_DIV:
1281         break; // TODO: Not implemented
1282       case CEED_EVAL_CURL:
1283         break; // TODO: Not implemented
1284       }
1285     }
1286     code << "    }\n";
1287   }
1288 
1289   // Output basis apply if needed
1290   // Generate the correct eval mode code for each output
1291   code << "\n    // -- Output field basis action and restrictions --\n";
1292   for (CeedInt i = 0; i < numoutputfields; i++) {
1293     code << "    // ---- Output field "<<i<<" ----\n";
1294     // Get elemsize, emode, ncomp
1295     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1296     CeedChk(ierr);
1297     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1298     CeedChk(ierr);
1299     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1300     CeedChk(ierr);
1301     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1302     CeedChk(ierr);
1303     // Basis action
1304     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1305     switch (emode) {
1306     case CEED_EVAL_NONE:
1307       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1308       break; // No action
1309     case CEED_EVAL_INTERP:
1310       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1311       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1312       break;
1313     case CEED_EVAL_GRAD:
1314       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1315       if (useCollograd) {
1316         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1317       } else {
1318         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1319       }
1320       break;
1321     // LCOV_EXCL_START
1322     case CEED_EVAL_WEIGHT: {
1323       Ceed ceed;
1324       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1325       return CeedError(ceed, 1,
1326                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1327       break; // Should not occur
1328     }
1329     case CEED_EVAL_DIV:
1330       break; // TODO: Not implemented
1331     case CEED_EVAL_CURL:
1332       break; // TODO: Not implemented
1333       // LCOV_EXCL_STOP
1334     }
1335     // Restriction
1336       bool isStrided;
1337       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1338     if (!isStrided) {
1339       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1340       CeedChk(ierr);
1341       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1342       CeedInt compstride;
1343       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1344       code << "    // CompStride: "<<compstride<<"\n";
1345       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1346       data->indices.out[i] = restr_data->d_ind;
1347       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1348     } else {
1349       bool backendstrides;
1350       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1351       CeedChk(ierr);
1352       CeedInt nelem;
1353       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1354       CeedChk(ierr);
1355       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1356       if (!backendstrides) {
1357         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1358         CeedChk(ierr);
1359       }
1360       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1361       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1362     }
1363   }
1364 
1365   code << "  }\n";
1366   code << "}\n";
1367   code << "// -----------------------------------------------------------------------------\n\n";
1368 
1369   // View kernel for debugging
1370   CeedDebug(code.str().c_str());
1371 
1372   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1373                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1374   CeedChk(ierr);
1375   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1376   CeedChk(ierr);
1377 
1378   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1379   return 0;
1380 }
1381 //------------------------------------------------------------------------------
1382