xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 17be3a414c6fae47654f1361bae9c9dbcdd66795)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/ceed.h>
11 #include <ceed/backend.h>
12 #include <cuda_runtime.h>
13 #include <iostream>
14 #include <sstream>
15 #include "ceed-cuda-gen.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 #include "../cuda-ref/ceed-cuda-ref.h"
18 #include "../cuda-shared/ceed-cuda-shared.h"
19 
20 static const char *atomicAdd = QUOTE(
21 //------------------------------------------------------------------------------
22 // Atomic add, for older CUDA
23 //------------------------------------------------------------------------------
24 __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
25   unsigned long long int *address_as_ull = (unsigned long long int *)address;
26   unsigned long long int old = *address_as_ull, assumed;
27   do {
28     assumed = old;
29     old =
30       atomicCAS(address_as_ull, assumed,
31                 __double_as_longlong(val +
32                                     __longlong_as_double(assumed)));
33     // Note: uses integer comparison to avoid hang in case of NaN
34     // (since NaN != NaN)
35   } while (assumed != old);
36   return __longlong_as_double(old);
37 }
38 );
39 
40 static const char *deviceFunctions = QUOTE(
41 
42 //------------------------------------------------------------------------------
43 // Typedefs
44 //------------------------------------------------------------------------------
45 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
46 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
47 
48 typedef struct {
49   CeedInt tidx;
50   CeedInt tidy;
51   CeedInt tidz;
52   CeedInt tid;
53   CeedScalar* slice;
54 } BackendData;
55 
56 //------------------------------------------------------------------------------
57 // Load matrices for basis actions
58 //------------------------------------------------------------------------------
59 template <int P, int Q>
60 inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
61   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
62     B[i] = d_B[i];
63 }
64 
65 //------------------------------------------------------------------------------
66 // 1D
67 //------------------------------------------------------------------------------
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
72 template <int NCOMP, int COMPSTRIDE, int P1d>
73 inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
74   if (data.tidx < P1d) {
75     const CeedInt node = data.tidx;
76     const CeedInt ind = indices[node + elem * P1d];
77     for (CeedInt comp = 0; comp < NCOMP; ++comp)
78       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
79   }
80 }
81 
82 //------------------------------------------------------------------------------
83 // L-vector -> E-vector, strided
84 //------------------------------------------------------------------------------
85 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
86 inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
87   if (data.tidx < P1d) {
88     const CeedInt node = data.tidx;
89     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
90     for (CeedInt comp = 0; comp < NCOMP; ++comp)
91       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
92   }
93 }
94 
95 //------------------------------------------------------------------------------
96 // E-vector -> L-vector, offsets provided
97 //------------------------------------------------------------------------------
98 template <int NCOMP, int COMPSTRIDE, int P1d>
99 inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
100   if (data.tidx < P1d) {
101     const CeedInt node = data.tidx;
102     const CeedInt ind = indices[node + elem * P1d];
103     for (CeedInt comp = 0; comp < NCOMP; ++comp)
104       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
105   }
106 }
107 
108 //------------------------------------------------------------------------------
109 // E-vector -> L-vector, strided
110 //------------------------------------------------------------------------------
111 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
112 inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
113   if (data.tidx < P1d) {
114     const CeedInt node = data.tidx;
115     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
116     for (CeedInt comp = 0; comp < NCOMP; ++comp)
117       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
118   }
119 }
120 
121 //------------------------------------------------------------------------------
122 // 1D tensor contraction x
123 //------------------------------------------------------------------------------
124 template <int NCOMP, int P1d, int Q1d>
125 inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
126   data.slice[data.tidx] = *U;
127   __syncthreads();
128   *V = 0.0;
129   if (data.tidx < Q1d)
130     for (CeedInt i = 0; i < P1d; ++i)
131       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
132   __syncthreads();
133 }
134 
135 //------------------------------------------------------------------------------
136 // 1D transpose tensor contraction x
137 //------------------------------------------------------------------------------
138 template <int NCOMP, int P1d, int Q1d>
139 inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   data.slice[data.tidx] = *U;
141   __syncthreads();
142   *V = 0.0;
143   if (data.tidx < P1d)
144     for (CeedInt i = 0; i < Q1d; ++i)
145       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
146   __syncthreads();
147 }
148 
149 //------------------------------------------------------------------------------
150 // 1D interpolate to quadrature points
151 //------------------------------------------------------------------------------
152 template <int NCOMP, int P1d, int Q1d>
153 inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
154   for (CeedInt comp = 0; comp < NCOMP; comp++)
155     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
156 }
157 
158 //------------------------------------------------------------------------------
159 // 1D interpolate transpose
160 //------------------------------------------------------------------------------
161 template <int NCOMP, int P1d, int Q1d>
162 inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163   for (CeedInt comp=0; comp<NCOMP; comp++)
164     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 1D derivatives at quadrature points
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp = 0; comp < NCOMP; comp++)
173     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D derivatives transpose
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp = 0; comp < NCOMP; comp++)
182     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 2D
187 //------------------------------------------------------------------------------
188 
189 //------------------------------------------------------------------------------
190 // L-vector -> E-vector, offsets provided
191 //------------------------------------------------------------------------------
192 template <int NCOMP, int COMPSTRIDE, int P1d>
193 inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
194   if (data.tidx < P1d && data.tidy < P1d) {
195     const CeedInt node = data.tidx + data.tidy*P1d;
196     const CeedInt ind = indices[node + elem * P1d*P1d];
197     for (CeedInt comp = 0; comp < NCOMP; ++comp)
198       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
199   }
200 }
201 
202 //------------------------------------------------------------------------------
203 // L-vector -> E-vector, strided
204 //------------------------------------------------------------------------------
205 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
206 inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
207   if (data.tidx < P1d && data.tidy < P1d) {
208     const CeedInt node = data.tidx + data.tidy*P1d;
209     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
210     for (CeedInt comp = 0; comp < NCOMP; ++comp)
211       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
212   }
213 }
214 
215 //------------------------------------------------------------------------------
216 // E-vector -> L-vector, offsets provided
217 //------------------------------------------------------------------------------
218 template <int NCOMP, int COMPSTRIDE, int P1d>
219 inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
220   if (data.tidx < P1d && data.tidy < P1d) {
221     const CeedInt node = data.tidx + data.tidy*P1d;
222     const CeedInt ind = indices[node + elem * P1d*P1d];
223     for (CeedInt comp = 0; comp < NCOMP; ++comp)
224       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
225   }
226 }
227 
228 //------------------------------------------------------------------------------
229 // E-vector -> L-vector, strided
230 //------------------------------------------------------------------------------
231 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
232 inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
233   if (data.tidx < P1d && data.tidy < P1d) {
234     const CeedInt node = data.tidx + data.tidy*P1d;
235     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
236     for (CeedInt comp = 0; comp < NCOMP; ++comp)
237       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 2D tensor contraction x
243 //------------------------------------------------------------------------------
244 template <int NCOMP, int P1d, int Q1d>
245 inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
246   data.slice[data.tidx+data.tidy*T1d] = *U;
247   __syncthreads();
248   *V = 0.0;
249   if (data.tidx < Q1d && data.tidy < P1d)
250     for (CeedInt i = 0; i < P1d; ++i)
251       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
252   __syncthreads();
253 }
254 
255 //------------------------------------------------------------------------------
256 // 2D tensor contract y
257 //------------------------------------------------------------------------------
258 template <int NCOMP, int P1d, int Q1d>
259 inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
260   data.slice[data.tidx+data.tidy*T1d] = *U;
261   __syncthreads();
262   *V = 0.0;
263   if (data.tidx < Q1d && data.tidy < Q1d)
264     for (CeedInt i = 0; i < P1d; ++i)
265       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
266   __syncthreads();
267 }
268 
269 //------------------------------------------------------------------------------
270 // 2D transpose tensor contract y
271 //------------------------------------------------------------------------------
272 template <int NCOMP, int P1d, int Q1d>
273 inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274   data.slice[data.tidx+data.tidy*T1d] = *U;
275   __syncthreads();
276   *V = 0.0;
277   if (data.tidx < Q1d && data.tidy < P1d)
278     for (CeedInt i = 0; i < Q1d; ++i)
279       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
280   __syncthreads();
281 }
282 
283 //------------------------------------------------------------------------------
284 // 2D transpose tensor contract x
285 //------------------------------------------------------------------------------
286 template <int NCOMP, int P1d, int Q1d>
287 inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
288   data.slice[data.tidx+data.tidy*T1d] = *U;
289   __syncthreads();
290   *V = 0.0;
291   if (data.tidx < P1d && data.tidy < P1d)
292     for (CeedInt i = 0; i < Q1d; ++i)
293       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
294   __syncthreads();
295 }
296 
297 //------------------------------------------------------------------------------
298 // 2D transpose tensor contract and add x
299 //------------------------------------------------------------------------------
300 template <int NCOMP, int P1d, int Q1d>
301 inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   data.slice[data.tidx+data.tidy*T1d] = *U;
303   __syncthreads();
304   if (data.tidx < P1d && data.tidy < P1d)
305     for (CeedInt i = 0; i < Q1d; ++i)
306       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
307   __syncthreads();
308 }
309 
310 //------------------------------------------------------------------------------
311 // 2D interpolate to quadrature points
312 //------------------------------------------------------------------------------
313 template <int NCOMP, int P1d, int Q1d>
314 inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315   CeedScalar r_t[1];
316   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319   }
320 }
321 
322 //------------------------------------------------------------------------------
323 // 2D interpolate transpose
324 //------------------------------------------------------------------------------
325 template <int NCOMP, int P1d, int Q1d>
326 inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327   CeedScalar r_t[1];
328   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 2D derivatives at quadrature points
336 //------------------------------------------------------------------------------
337 template <int NCOMP, int P1d, int Q1d>
338 inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339   CeedScalar r_t[1];
340   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345   }
346 }
347 
348 //------------------------------------------------------------------------------
349 // 2D derivatives transpose
350 //------------------------------------------------------------------------------
351 template <int NCOMP, int P1d, int Q1d>
352 inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353   CeedScalar r_t[1];
354   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359   }
360 }
361 
362 //------------------------------------------------------------------------------
363 // 3D
364 //------------------------------------------------------------------------------
365 
366 //------------------------------------------------------------------------------
367 // L-vector -> E-vector, offsets provided
368 //------------------------------------------------------------------------------
369 template <int NCOMP, int COMPSTRIDE, int P1d>
370 inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
371   if (data.tidx < P1d && data.tidy < P1d)
372     for (CeedInt z = 0; z < P1d; ++z) {
373       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375       for (CeedInt comp = 0; comp < NCOMP; ++comp)
376         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377     }
378 }
379 
380 //------------------------------------------------------------------------------
381 // L-vector -> E-vector, strided
382 //------------------------------------------------------------------------------
383 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384 inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
385   if (data.tidx < P1d && data.tidy < P1d)
386     for (CeedInt z = 0; z < P1d; ++z) {
387       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391     }
392 }
393 
394 //------------------------------------------------------------------------------
395 // E-vector -> Q-vector, offests provided
396 //------------------------------------------------------------------------------
397 template <int NCOMP, int COMPSTRIDE, int Q1d>
398 inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
399   if (data.tidx < Q1d && data.tidy < Q1d) {
400     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
401     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
402     for (CeedInt comp = 0; comp < NCOMP; ++comp)
403       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
404   }
405 }
406 
407 //------------------------------------------------------------------------------
408 // E-vector -> Q-vector, strided
409 //------------------------------------------------------------------------------
410 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
411 inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
412   if (data.tidx < Q1d && data.tidy < Q1d) {
413     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
414     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
415     for (CeedInt comp = 0; comp < NCOMP; ++comp)
416       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
417   }
418 }
419 
420 //------------------------------------------------------------------------------
421 // E-vector -> L-vector, offsets provided
422 //------------------------------------------------------------------------------
423 template <int NCOMP, int COMPSTRIDE, int P1d>
424 inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
425   if (data.tidx < P1d && data.tidy < P1d)
426     for (CeedInt z = 0; z < P1d; ++z) {
427       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
428       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
429       for (CeedInt comp = 0; comp < NCOMP; ++comp)
430         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
431     }
432 }
433 
434 //------------------------------------------------------------------------------
435 // E-vector -> L-vector, strided
436 //------------------------------------------------------------------------------
437 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
438 inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
439   if (data.tidx < P1d && data.tidy < P1d)
440     for (CeedInt z = 0; z < P1d; ++z) {
441       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
442       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
443       for (CeedInt comp = 0; comp < NCOMP; ++comp)
444         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
445     }
446 }
447 
448 //------------------------------------------------------------------------------
449 // 3D tensor contract x
450 //------------------------------------------------------------------------------
451 template <int NCOMP, int P1d, int Q1d>
452 inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
453   CeedScalar r_B[P1d];
454   for (CeedInt i = 0; i < P1d; ++i)
455     r_B[i] = B[i + data.tidx*P1d];
456 
457   for (CeedInt k = 0; k < P1d; ++k) {
458     data.slice[data.tidx+data.tidy*T1d] = U[k];
459     __syncthreads();
460     V[k] = 0.0;
461     if (data.tidx < Q1d && data.tidy < P1d)
462       for (CeedInt i = 0; i < P1d; ++i)
463         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
464     __syncthreads();
465   }
466 }
467 
468 //------------------------------------------------------------------------------
469 // 3D tensor contract y
470 //------------------------------------------------------------------------------
471 template <int NCOMP, int P1d, int Q1d>
472 inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
473   CeedScalar r_B[P1d];
474   for (CeedInt i = 0; i < P1d; ++i)
475     r_B[i] = B[i + data.tidy*P1d];
476 
477   for (CeedInt k = 0; k < P1d; ++k) {
478     data.slice[data.tidx+data.tidy*T1d] = U[k];
479     __syncthreads();
480     V[k] = 0.0;
481     if (data.tidx < Q1d && data.tidy < Q1d)
482       for (CeedInt i = 0; i < P1d; ++i)
483         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
484     __syncthreads();
485   }
486 }
487 
488 //------------------------------------------------------------------------------
489 // 3D tensor contract z
490 //------------------------------------------------------------------------------
491 template <int NCOMP, int P1d, int Q1d>
492 inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
493   for (CeedInt k = 0; k < Q1d; ++k) {
494     V[k] = 0.0;
495     if (data.tidx < Q1d && data.tidy < Q1d)
496       for (CeedInt i = 0; i < P1d; ++i)
497         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
498   }
499 }
500 
501 //------------------------------------------------------------------------------
502 // 3D transpose tensor contract z
503 //------------------------------------------------------------------------------
504 template <int NCOMP, int P1d, int Q1d>
505 inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
506   for (CeedInt k = 0; k < P1d; ++k) {
507     V[k] = 0.0;
508     if (data.tidx < Q1d && data.tidy < Q1d)
509       for (CeedInt i = 0; i < Q1d; ++i)
510         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
511   }
512 }
513 
514 //------------------------------------------------------------------------------
515 // 3D transpose tensor contract y
516 //------------------------------------------------------------------------------
517 template <int NCOMP, int P1d, int Q1d>
518 inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
519   CeedScalar r_B[Q1d];
520   for (CeedInt i = 0; i < Q1d; ++i)
521     r_B[i] = B[data.tidy + i*P1d];
522 
523   for (CeedInt k = 0; k < P1d; ++k) {
524     data.slice[data.tidx+data.tidy*T1d] = U[k];
525     __syncthreads();
526     V[k] = 0.0;
527     if (data.tidx < Q1d && data.tidy < P1d)
528       for (CeedInt i = 0; i < Q1d; ++i)
529         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
530     __syncthreads();
531   }
532 }
533 
534 //------------------------------------------------------------------------------
535 // 3D transpose tensor contract add y
536 //------------------------------------------------------------------------------
537 template <int NCOMP, int P1d, int Q1d>
538 inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
539   CeedScalar r_B[Q1d];
540   for (CeedInt i = 0; i < Q1d; ++i)
541     r_B[i] = B[data.tidy + i*P1d];
542 
543   for (CeedInt k = 0; k < P1d; ++k) {
544     data.slice[data.tidx+data.tidy*T1d] = U[k];
545     __syncthreads();
546     if (data.tidx < Q1d && data.tidy < P1d)
547       for (CeedInt i = 0; i < Q1d; ++i)
548         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
549     __syncthreads();
550   }
551 }
552 
553 //------------------------------------------------------------------------------
554 // 3D transpose tensor contract x
555 //------------------------------------------------------------------------------
556 template <int NCOMP, int P1d, int Q1d>
557 inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
558   CeedScalar r_B[Q1d];
559   for (CeedInt i = 0; i < Q1d; ++i)
560     r_B[i] = B[data.tidx + i*P1d];
561 
562   for (CeedInt k = 0; k < P1d; ++k) {
563     data.slice[data.tidx+data.tidy*T1d] = U[k];
564     __syncthreads();
565     V[k] = 0.0;
566     if (data.tidx < P1d && data.tidy < P1d)
567       for (CeedInt i = 0; i < Q1d; ++i)
568         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
569     __syncthreads();
570   }
571 }
572 
573 //------------------------------------------------------------------------------
574 // 3D transpose tensor contract add x
575 //------------------------------------------------------------------------------
576 template <int NCOMP, int P1d, int Q1d>
577 inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
578   CeedScalar r_B[Q1d];
579   for (CeedInt i = 0; i < Q1d; ++i)
580     r_B[i] = B[data.tidx + i*P1d];
581 
582   for (CeedInt k = 0; k < P1d; ++k) {
583     data.slice[data.tidx+data.tidy*T1d] = U[k];
584     __syncthreads();
585     if (data.tidx < P1d && data.tidy < P1d)
586       for (CeedInt i = 0; i < Q1d; ++i)
587         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
588     __syncthreads();
589   }
590 }
591 
592 //------------------------------------------------------------------------------
593 // 3D interpolate to quadrature points
594 //------------------------------------------------------------------------------
595 template <int NCOMP, int P1d, int Q1d>
596 inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
597   CeedScalar r_t1[T1d];
598   CeedScalar r_t2[T1d];
599   for (CeedInt comp = 0; comp < NCOMP; comp++) {
600     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
601     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
602     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
603   }
604 }
605 
606 //------------------------------------------------------------------------------
607 // 3D interpolate transpose
608 //------------------------------------------------------------------------------
609 template <int NCOMP, int P1d, int Q1d>
610 inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
611   CeedScalar r_t1[T1d];
612   CeedScalar r_t2[T1d];
613   for (CeedInt comp = 0; comp < NCOMP; comp++) {
614     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
615     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
616     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
617   }
618 }
619 
620 //------------------------------------------------------------------------------
621 // 3D derivatives at quadrature points
622 //------------------------------------------------------------------------------
623 template <int NCOMP, int P1d, int Q1d>
624 inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
625   CeedScalar r_t1[T1d];
626   CeedScalar r_t2[T1d];
627   for (CeedInt comp = 0; comp < NCOMP; comp++) {
628     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
629     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
630     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
631     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
632     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
633     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
634     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
635     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
636     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
637   }
638 }
639 
640 //------------------------------------------------------------------------------
641 // 3D derivatives transpose
642 //------------------------------------------------------------------------------
643 template <int NCOMP, int P1d, int Q1d>
644 inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
645   CeedScalar r_t1[T1d];
646   CeedScalar r_t2[T1d];
647   for (CeedInt comp = 0; comp < NCOMP; comp++) {
648     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
649     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
650     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
651     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
652     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
653     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
654     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
655     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
656     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
657   }
658 }
659 
660 //------------------------------------------------------------------------------
661 // 3D collocated derivatives computation
662 //------------------------------------------------------------------------------
663 template <int NCOMP, int Q1d>
664 inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
665   if (data.tidx < Q1d && data.tidy < Q1d) {
666     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
667       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
668       __syncthreads();
669       // X derivative
670       r_V[comp+0*NCOMP] = 0.0;
671       for (CeedInt i = 0; i < Q1d; ++i)
672         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
673       // Y derivative
674       r_V[comp+1*NCOMP] = 0.0;
675       for (CeedInt i = 0; i < Q1d; ++i)
676         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
677       // Z derivative
678       r_V[comp+2*NCOMP] = 0.0;
679       for (CeedInt i = 0; i < Q1d; ++i)
680         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
681       __syncthreads();
682     }
683   }
684 }
685 
686 //------------------------------------------------------------------------------
687 // 3D collocated derivatives transpose
688 //------------------------------------------------------------------------------
689 template <int NCOMP, int Q1d>
690 inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
691   if (data.tidx < Q1d && data.tidy < Q1d) {
692     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
693       // X derivative
694       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
695       __syncthreads();
696       for (CeedInt i = 0; i < Q1d; ++i)
697         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
698       __syncthreads();
699       // Y derivative
700       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
701       __syncthreads();
702       for (CeedInt i = 0; i < Q1d; ++i)
703         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
704       __syncthreads();
705       // Z derivative
706       for (CeedInt i = 0; i < Q1d; ++i)
707         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
708     }
709   }
710 }
711 
712 //------------------------------------------------------------------------------
713 // 1D quadrature weights
714 //------------------------------------------------------------------------------
715 template <int Q1d>
716 inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
717   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
718 }
719 
720 //------------------------------------------------------------------------------
721 // 2D quadrature weights
722 //------------------------------------------------------------------------------
723 template <int Q1d>
724 inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
725   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
726         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
727 }
728 
729 //------------------------------------------------------------------------------
730 // 3D quadrature weights
731 //------------------------------------------------------------------------------
732 template <int Q1d>
733 inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
734   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
735   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736   for (CeedInt z = 0; z < Q1d; ++z)
737     w[z] = quad ? pw*qweight1d[z] : 0.0;
738 }
739 
740 );
741 //------------------------------------------------------------------------------
742 // Build singe operator kernel
743 //------------------------------------------------------------------------------
744 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
745 
746   using std::ostringstream;
747   using std::string;
748   int ierr;
749   bool setupdone;
750   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
751   if (setupdone) return CEED_ERROR_SUCCESS;
752   Ceed ceed;
753   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
754   CeedOperator_Cuda_gen *data;
755   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
756   CeedQFunction qf;
757   CeedQFunction_Cuda_gen *qf_data;
758   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
759   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
760   CeedSize lsize;
761   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
762           numoutputfields, ncomp, dim = 1;
763   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
764   Q1d = Q;
765   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
766   CeedOperatorField *opinputfields, *opoutputfields;
767   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
768   CeedChkBackend(ierr);
769   CeedQFunctionField *qfinputfields, *qfoutputfields;
770   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
771   CeedChkBackend(ierr);
772   CeedEvalMode emode;
773   CeedBasis basis;
774   CeedBasis_Cuda_shared *basis_data;
775   CeedElemRestriction Erestrict;
776   CeedElemRestriction_Cuda *restr_data;
777 
778   // Check for restriction only identity operator
779   bool is_identity_qf;
780   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
781   if (is_identity_qf) {
782     CeedEvalMode emodein, emodeout;
783     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
784     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
785     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
786       // LCOV_EXCL_START
787       return CeedError(ceed, CEED_ERROR_BACKEND,
788                        "Backend does not implement restriction only identity operators");
789     // LCOV_EXCL_STOP
790   }
791 
792   ostringstream code;
793   string devFunctions(deviceFunctions);
794 
795   // Add atomicAdd function for old NVidia architectures
796   struct cudaDeviceProp prop;
797   Ceed_Cuda *ceed_data;
798   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
799   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
800   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
801     code << atomicAdd;
802   }
803 
804   code << devFunctions;
805 
806   string qFunction(qf_data->qFunctionSource);
807   string qFunctionName(qf_data->qFunctionName);
808   string oper;
809   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
810 
811   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
812   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
813   code << "#define CeedPragmaSIMD\n";
814   code << "#define CEED_ERROR_SUCCESS 0\n\n";
815 
816   // Find dim and Q1d
817   bool useCollograd = true;
818   data->maxP1d = 0;
819   for (CeedInt i = 0; i < numinputfields; i++) {
820     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
821     if (basis != CEED_BASIS_COLLOCATED) {
822       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
823       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
824       CeedChkBackend(ierr);
825 
826       // Check for collocated gradient
827       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
828 
829       // Collect dim and Q1d
830       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
831       bool isTensor;
832       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
833       if (isTensor) {
834         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
835         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
836         if (P1d>data->maxP1d) data->maxP1d = P1d;
837       } else {
838         // LCOV_EXCL_START
839         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
840         // LCOV_EXCL_STOP
841         }
842     }
843   }
844   // Check output bases for Q1d, dim as well
845   //   The only imput basis might be CEED_BASIS_COLLOCATED
846   for (CeedInt i = 0; i < numoutputfields; i++) {
847     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
848 
849     if (basis != CEED_BASIS_COLLOCATED) {
850       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
851       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
852       CeedChkBackend(ierr);
853 
854       // Collect dim and Q1d
855       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
856       bool isTensor;
857       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
858       if (isTensor) {
859         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
860       } else {
861         // LCOV_EXCL_START
862         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
863         // LCOV_EXCL_STOP
864         }
865 
866       // Check for collocated gradient
867       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
868     }
869   }
870   data->dim = dim;
871   data->Q1d = Q1d;
872 
873   // Define CEED_Q_VLA
874   if (dim != 3 || useCollograd) {
875     code << "\n#define CEED_Q_VLA 1\n\n";
876   } else {
877     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
878   }
879 
880   code << qFunction;
881 
882   // Setup
883   code << "\n// -----------------------------------------------------------------------------\n";
884   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
885   for (CeedInt i = 0; i < numinputfields; i++) {
886     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
887     CeedChkBackend(ierr);
888     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
889       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
890     }
891   }
892 
893   for (CeedInt i = 0; i < numoutputfields; i++) {
894     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
895   }
896 
897   code << "  const CeedInt Dim = "<<dim<<";\n";
898   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
899 
900   code << "  extern __shared__ CeedScalar slice[];\n";
901   code << "  BackendData data;\n";
902   code << "  data.tidx = threadIdx.x;\n";
903   code << "  data.tidy = threadIdx.y;\n";
904   code << "  data.tidz = threadIdx.z;\n";
905   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
906   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
907 
908   code << "\n  // -- Input field constants and basis data --\n";
909   //Initialize constants, and matrices B and G
910   for (CeedInt i = 0; i < numinputfields; i++) {
911     code << "  // ---- Input field "<<i<<" ----\n";
912     // Get elemsize, emode, ncomp
913     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
914     CeedChkBackend(ierr);
915     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
916     CeedChkBackend(ierr);
917     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
918     CeedChkBackend(ierr);
919     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
920     CeedChkBackend(ierr);
921 
922     // Set field constants
923     if (emode != CEED_EVAL_WEIGHT) {
924       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
925       if (basis != CEED_BASIS_COLLOCATED) {
926         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
927         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
928       } else {
929         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
930       }
931       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
932     }
933 
934     // Load basis data
935     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
936     switch (emode) {
937     case CEED_EVAL_NONE:
938       break;
939     case CEED_EVAL_INTERP:
940       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
941       data->B.in[i] = basis_data->d_interp_1d;
942       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
943       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
944       break;
945     case CEED_EVAL_GRAD:
946       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
947       data->B.in[i] = basis_data->d_interp_1d;
948       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
950       if (useCollograd) {
951         data->G.in[i] = basis_data->d_collo_grad_1d;
952         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
953         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
954       } else {
955         data->G.in[i] = basis_data->d_grad_1d;
956         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
957         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
958       }
959       break;
960     case CEED_EVAL_WEIGHT:
961       break; // No action
962     case CEED_EVAL_DIV:
963       break; // TODO: Not implemented
964     case CEED_EVAL_CURL:
965       break; // TODO: Not implemented
966     }
967   }
968 
969   code << "\n  // -- Output field constants and basis data --\n";
970   for (CeedInt i = 0; i < numoutputfields; i++) {
971     code << "  // ---- Output field "<<i<<" ----\n";
972     // Get elemsize, emode, ncomp
973     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
974     CeedChkBackend(ierr);
975     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
976     CeedChkBackend(ierr);
977     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
978     CeedChkBackend(ierr);
979     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
980     CeedChkBackend(ierr);
981 
982     // Set field constants
983     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
984     if (basis != CEED_BASIS_COLLOCATED) {
985       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
986       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
987     } else {
988       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
989     }
990     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
991 
992     // Load basis data
993     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
994     switch (emode) {
995     case CEED_EVAL_NONE:
996       break; // No action
997     case CEED_EVAL_INTERP:
998       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
999       data->B.out[i] = basis_data->d_interp_1d;
1000       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1001       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1002       break;
1003     case CEED_EVAL_GRAD:
1004       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1005       data->B.out[i] = basis_data->d_interp_1d;
1006       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1008       if (useCollograd) {
1009         data->G.out[i] = basis_data->d_collo_grad_1d;
1010         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1011         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1012       } else {
1013         data->G.out[i] = basis_data->d_grad_1d;
1014         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1015         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1016       }
1017       break;
1018     // LCOV_EXCL_START
1019     case CEED_EVAL_WEIGHT: {
1020       Ceed ceed;
1021       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1022       return CeedError(ceed, CEED_ERROR_BACKEND,
1023                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1024       break; // Should not occur
1025     }
1026     case CEED_EVAL_DIV:
1027       break; // TODO: Not implemented
1028     case CEED_EVAL_CURL:
1029       break; // TODO: Not implemented
1030       // LCOV_EXCL_STOP
1031     }
1032   }
1033   code << "\n  // -- Element loop --\n";
1034   code << "  __syncthreads();\n";
1035   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1036   // Input basis apply if needed
1037   // Generate the correct eval mode code for each input
1038   code << "    // -- Input field restrictions and basis actions --\n";
1039   for (CeedInt i = 0; i < numinputfields; i++) {
1040     code << "    // ---- Input field "<<i<<" ----\n";
1041     // Get elemsize, emode, ncomp
1042     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1043     CeedChkBackend(ierr);
1044     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1045     CeedChkBackend(ierr);
1046     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1047     CeedChkBackend(ierr);
1048     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1049     CeedChkBackend(ierr);
1050 
1051     // Restriction
1052     if (emode != CEED_EVAL_WEIGHT &&
1053         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1054       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1055 
1056       bool isStrided;
1057       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1058       if (!isStrided) {
1059         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1060         CeedChkBackend(ierr);
1061         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1062         CeedInt compstride;
1063         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1064         code << "    // CompStride: "<<compstride<<"\n";
1065         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1066         data->indices.in[i] = restr_data->d_ind;
1067         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1068       } else {
1069         bool backendstrides;
1070         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1071         CeedChkBackend(ierr);
1072         CeedInt nelem;
1073         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1074         CeedChkBackend(ierr);
1075         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1076         if (!backendstrides) {
1077           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1078           CeedChkBackend(ierr);
1079         }
1080         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1081         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1082       }
1083     }
1084 
1085     // Basis action
1086     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1087     switch (emode) {
1088     case CEED_EVAL_NONE:
1089       if (!useCollograd) {
1090         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1091       }
1092       break;
1093     case CEED_EVAL_INTERP:
1094       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1095       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1096       break;
1097     case CEED_EVAL_GRAD:
1098       if (useCollograd) {
1099         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101       } else {
1102         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1103         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1104       }
1105       break;
1106     case CEED_EVAL_WEIGHT:
1107       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1108       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1109       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1110       data->W = basis_data->d_q_weight_1d;
1111       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1112       break; // No action
1113     case CEED_EVAL_DIV:
1114       break; // TODO: Not implemented
1115     case CEED_EVAL_CURL:
1116       break; // TODO: Not implemented
1117     }
1118   }
1119 
1120   // Q function
1121   code << "\n    // -- Output field setup --\n";
1122   for (CeedInt i = 0; i < numoutputfields; i++) {
1123       code << "\n    // ---- Output field "<<i<<" ----\n";
1124     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1125     CeedChkBackend(ierr);
1126     if (emode==CEED_EVAL_GRAD)
1127     {
1128       if (useCollograd) {
1129         //Accumulator for gradient slices
1130         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1132         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1133         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1134         code << "      }\n";
1135         code << "    }\n";
1136       } else {
1137         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1138       }
1139     }
1140     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1141     {
1142       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1143     }
1144   }
1145   // We treat quadrature points per slice in 3d to save registers
1146   if (useCollograd) {
1147     code << "\n    // Note: Collocated Gradient\n";
1148     code << "#pragma unroll\n";
1149     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1150     code << "      // -- Input fields --\n";
1151     for (CeedInt i = 0; i < numinputfields; i++) {
1152       code << "      // ---- Input field "<<i<<" ----\n";
1153       // Get elemsize, emode, ncomp
1154       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1155       CeedChkBackend(ierr);
1156       // Basis action
1157       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1158       switch (emode) {
1159       case CEED_EVAL_NONE:
1160         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1161 
1162         bool isStrided;
1163         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1164         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1165         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1166         if (!isStrided) {
1167           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1168           CeedChkBackend(ierr);
1169           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1170           CeedInt compstride;
1171           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1172           code << "      // CompStride: "<<compstride<<"\n";
1173           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1174           data->indices.in[i] = restr_data->d_ind;
1175           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1176         } else {
1177           bool backendstrides;
1178           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1179           CeedChkBackend(ierr);
1180           CeedInt nelem;
1181           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1182           CeedChkBackend(ierr);
1183           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1184           if (!backendstrides) {
1185             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1186             CeedChkBackend(ierr);
1187           }
1188           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1189           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1190         }
1191         break;
1192       case CEED_EVAL_INTERP:
1193         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1194         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1195         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1196         code << "      }\n";
1197         break;
1198       case CEED_EVAL_GRAD:
1199         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1200         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1201         break;
1202       case CEED_EVAL_WEIGHT:
1203         code << "      CeedScalar r_q"<<i<<"[1];\n";
1204         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1205         break; // No action
1206       case CEED_EVAL_DIV:
1207         break; // TODO: Not implemented
1208       case CEED_EVAL_CURL:
1209         break; // TODO: Not implemented
1210       }
1211     }
1212     code << "\n      // -- Output fields --\n";
1213     for (CeedInt i = 0; i < numoutputfields; i++) {
1214       code << "      // ---- Output field "<<i<<" ----\n";
1215       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1216       CeedChkBackend(ierr);
1217       // Basis action
1218       switch (emode) {
1219       case CEED_EVAL_NONE:
1220         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1221         break; // No action
1222       case CEED_EVAL_INTERP:
1223         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1224         break;
1225       case CEED_EVAL_GRAD:
1226         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1227         break;
1228       case CEED_EVAL_WEIGHT:
1229         break; // Should not occur
1230       case CEED_EVAL_DIV:
1231         break; // TODO: Not implemented
1232       case CEED_EVAL_CURL:
1233         break; // TODO: Not implemented
1234       }
1235     }
1236   } else {
1237     code << "\n      // Note: No Collocated Gradient\n";
1238     code << "      // -- Input fields --\n";
1239     for (CeedInt i = 0; i < numinputfields; i++) {
1240       code << "      // ---- Input field "<<i<<" ----\n";
1241       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1242     }
1243     code << "      // -- Output fields --\n";
1244     for (CeedInt i = 0; i < numoutputfields; i++) {
1245       code << "      // ---- Output field "<<i<<" ----\n";
1246       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1247     }
1248   }
1249   code << "\n      // -- QFunction Inputs and outputs --\n";
1250   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1251   for (CeedInt i = 0; i < numinputfields; i++) {
1252     code << "      // ---- Input field "<<i<<" ----\n";
1253     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1254   }
1255   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1256   for (CeedInt i = 0; i < numoutputfields; i++) {
1257     code << "      // ---- Output field "<<i<<" ----\n";
1258     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1259   }
1260   code << "\n      // -- Apply QFunction --\n";
1261   code << "      "<<qFunctionName<<"(ctx, ";
1262   if (dim != 3 || useCollograd) {
1263     code << "1";
1264   } else {
1265     code << "Q1d";
1266   }
1267   code << ", in, out);\n";
1268   if (useCollograd) {
1269     code << "\n      // Note: Collocated Gradient\n";
1270     code << "      // -- Output fields --\n";
1271     for (CeedInt i = 0; i < numoutputfields; i++) {
1272       code << "      // ---- Output field "<<i<<" ----\n";
1273       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1274       CeedChkBackend(ierr);
1275       // Basis action
1276       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1277       switch (emode) {
1278       case CEED_EVAL_NONE:
1279         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1280         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1281         code << "      }\n";
1282         break; // No action
1283       case CEED_EVAL_INTERP:
1284         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1285         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1286         code << "      }\n";
1287         break;
1288       case CEED_EVAL_GRAD:
1289         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1290         break;
1291       case CEED_EVAL_WEIGHT:
1292         break; // Should not occur
1293       case CEED_EVAL_DIV:
1294         break; // TODO: Not implemented
1295       case CEED_EVAL_CURL:
1296         break; // TODO: Not implemented
1297       }
1298     }
1299     code << "    }\n";
1300   }
1301 
1302   // Output basis apply if needed
1303   // Generate the correct eval mode code for each output
1304   code << "\n    // -- Output field basis action and restrictions --\n";
1305   for (CeedInt i = 0; i < numoutputfields; i++) {
1306     code << "    // ---- Output field "<<i<<" ----\n";
1307     // Get elemsize, emode, ncomp
1308     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1309     CeedChkBackend(ierr);
1310     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1311     CeedChkBackend(ierr);
1312     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1313     CeedChkBackend(ierr);
1314     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1315     CeedChkBackend(ierr);
1316     // Basis action
1317     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1318     switch (emode) {
1319     case CEED_EVAL_NONE:
1320       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1321       break; // No action
1322     case CEED_EVAL_INTERP:
1323       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1324       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1325       break;
1326     case CEED_EVAL_GRAD:
1327       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1328       if (useCollograd) {
1329         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1330       } else {
1331         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1332       }
1333       break;
1334     // LCOV_EXCL_START
1335     case CEED_EVAL_WEIGHT: {
1336       Ceed ceed;
1337       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1338       return CeedError(ceed, CEED_ERROR_BACKEND,
1339                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1340       break; // Should not occur
1341     }
1342     case CEED_EVAL_DIV:
1343       break; // TODO: Not implemented
1344     case CEED_EVAL_CURL:
1345       break; // TODO: Not implemented
1346       // LCOV_EXCL_STOP
1347     }
1348     // Restriction
1349       bool isStrided;
1350       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1351     if (!isStrided) {
1352       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1353       CeedChkBackend(ierr);
1354       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1355       CeedInt compstride;
1356       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1357       code << "    // CompStride: "<<compstride<<"\n";
1358       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1359       data->indices.out[i] = restr_data->d_ind;
1360       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1361     } else {
1362       bool backendstrides;
1363       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1364       CeedChkBackend(ierr);
1365       CeedInt nelem;
1366       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1367       CeedChkBackend(ierr);
1368       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1369       if (!backendstrides) {
1370         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1371         CeedChkBackend(ierr);
1372       }
1373       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1374       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1375     }
1376   }
1377 
1378   code << "  }\n";
1379   code << "}\n";
1380   code << "// -----------------------------------------------------------------------------\n\n";
1381 
1382   // View kernel for debugging
1383   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1384   CeedDebug(ceed, code.str().c_str());
1385 
1386   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1387                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1388   CeedChkBackend(ierr);
1389   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1390   CeedChkBackend(ierr);
1391 
1392   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1393   return CEED_ERROR_SUCCESS;
1394 }
1395 //------------------------------------------------------------------------------
1396