xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 6d69246ad8d06846a22b2f7e3720a77f7a77e7f6)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <cuda_runtime.h>
22 #include <iostream>
23 #include <sstream>
24 #include "ceed-cuda-gen.h"
25 #include "../cuda/ceed-cuda-compile.h"
26 #include "../cuda-shared/ceed-cuda-shared.h"
27 
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
// Software double-precision atomicAdd built on atomicCAS, for devices whose
// hardware atomicAdd has no double overload (pre-SM60). This is the canonical
// loop from the CUDA C Programming Guide: read the current value, add, and
// CAS it back; retry if another thread updated the word in between.
// Returns the value stored at *address before this add, matching the
// built-in atomicAdd contract.
__device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                    __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
47 
48 static const char *deviceFunctions = QUOTE(
49 
50 //------------------------------------------------------------------------------
51 // Typedefs
52 //------------------------------------------------------------------------------
// Input/output device-pointer bundles passed to the generated operator kernel
// (fixed capacity of 16 fields each way).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
// Same layout for integer (restriction index) arrays.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread state threaded through every device helper below.
// NOTE(review): fields are initialized by the generated kernel, outside this
// view — presumably tidx/tidy/tidz mirror threadIdx.{x,y,z} and tid is the
// flattened in-block thread id (loadMatrix strides by it over the whole
// block); slice looks like a per-block shared-memory scratch area. Confirm
// against the kernel-body generator.
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;
63 
64 //------------------------------------------------------------------------------
65 // Load matrices for basis actions
66 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Block-cooperative copy of a P*Q basis matrix from d_B into B:
  // every thread copies entries data.tid, data.tid + nthreads, ...
  const CeedInt nthreads = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += nthreads)
    B[k] = d_B[k];
}
72 
73 //------------------------------------------------------------------------------
74 // 1D
75 //------------------------------------------------------------------------------
76 
77 //------------------------------------------------------------------------------
78 // L-vector -> E-vector, offsets provided
79 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather NCOMP components for this thread's node of element elem from the
  // L-vector d_u into registers r_u, going through the offset indices.
  if (data.tidx >= P1d) return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + COMPSTRIDE * c];
}
89 
90 //------------------------------------------------------------------------------
91 // L-vector -> E-vector, strided
92 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather NCOMP components for this thread's node from a strided
  // (identity-restriction) L-vector into registers.
  if (data.tidx >= P1d) return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
102 
103 //------------------------------------------------------------------------------
104 // E-vector -> L-vector, offsets provided
105 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add NCOMP register values back into the L-vector through the
  // offset indices; atomicAdd because nodes are shared between elements.
  if (data.tidx >= P1d) return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
}
115 
116 //------------------------------------------------------------------------------
117 // E-vector -> L-vector, strided
118 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate NCOMP register values into a strided L-vector; plain += is
  // safe here since strided nodes are not shared between elements.
  if (data.tidx >= P1d) return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
128 
129 //------------------------------------------------------------------------------
130 // 1D tensor contraction x
131 //------------------------------------------------------------------------------
// Apply B (Q1d x P1d, row-major by quadrature point) to one component held
// one-value-per-thread: stage all P1d nodal values in the shared slice, then
// each of the first Q1d threads accumulates its quadrature value.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // staged values must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
142 
143 //------------------------------------------------------------------------------
144 // 1D transpose tensor contraction x
145 //------------------------------------------------------------------------------
// Apply B^T to one component: stage the Q1d quadrature values in the shared
// slice, then the first P1d threads each accumulate one nodal value
// (B indexed column-wise, i.e. transposed access).
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // staged values must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
156 
157 //------------------------------------------------------------------------------
158 // 1D interpolate to quadrature points
159 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Interpolate each component to quadrature points with the 1D basis matrix.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
165 
166 //------------------------------------------------------------------------------
167 // 1D interpolate transpose
168 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: map each component back from quadrature points
  // to nodes with B^T.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
174 
175 //------------------------------------------------------------------------------
176 // 1D derivatives at quadrature points
177 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // 1D derivative at quadrature points: same contraction, gradient matrix.
  // (c_B is unused in 1D but kept for a uniform wrapper signature.)
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
183 
184 //------------------------------------------------------------------------------
185 // 1D derivatives transpose
186 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad1d: apply G^T per component.
  // (c_B is unused in 1D but kept for a uniform wrapper signature.)
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
192 
193 //------------------------------------------------------------------------------
194 // 2D
195 //------------------------------------------------------------------------------
196 
197 //------------------------------------------------------------------------------
198 // L-vector -> E-vector, offsets provided
199 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather NCOMP components for this thread's (tidx, tidy) node of element
  // elem through the offset indices.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + COMPSTRIDE * c];
}
209 
210 //------------------------------------------------------------------------------
211 // L-vector -> E-vector, strided
212 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather NCOMP components for this thread's (tidx, tidy) node from a
  // strided (identity-restriction) L-vector.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt base = (data.tidx + data.tidy*P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
222 
223 //------------------------------------------------------------------------------
224 // E-vector -> L-vector, offsets provided
225 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add NCOMP register values into the L-vector through the offset
  // indices; atomicAdd because nodes are shared between elements.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
}
235 
236 //------------------------------------------------------------------------------
237 // E-vector -> L-vector, strided
238 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate NCOMP register values into a strided L-vector; plain += is
  // safe since strided nodes are not shared between elements.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt base = (data.tidx + data.tidy*P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
248 
249 //------------------------------------------------------------------------------
250 // 2D tensor contraction x
251 //------------------------------------------------------------------------------
// Contract B along x for one component held one-value-per-thread on a
// (tidx, tidy) tile: stage the whole tile in the shared slice (T1d pitch),
// then threads with tidx < Q1d, tidy < P1d accumulate.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged tile must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
262 
263 //------------------------------------------------------------------------------
264 // 2D tensor contract y
265 //------------------------------------------------------------------------------
// Contract B along y for one component: stage the tile, then threads with
// tidx < Q1d, tidy < Q1d accumulate down a column of the slice.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged tile must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
276 
277 //------------------------------------------------------------------------------
278 // 2D transpose tensor contract y
279 //------------------------------------------------------------------------------
// Contract B^T along y (quadrature -> nodes in y): stage the tile, then
// threads with tidx < Q1d, tidy < P1d accumulate using transposed B access.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged tile must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
290 
291 //------------------------------------------------------------------------------
292 // 2D transpose tensor contract x
293 //------------------------------------------------------------------------------
// Contract B^T along x (quadrature -> nodes in x): stage the tile, then
// threads with tidx < P1d, tidy < P1d accumulate; overwrites *V.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged tile must be visible block-wide before reading
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
304 
305 //------------------------------------------------------------------------------
306 // 2D transpose tensor contract and add x
307 //------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into *V (no zeroing) — used
// to sum the second term of the transpose gradient into an existing result.
// Preconditions: all threads of the block reach this call (contains barriers).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged tile must be visible block-wide before reading
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // keep slice stable until every thread is done reading
}
317 
318 //------------------------------------------------------------------------------
319 // 2D interpolate to quadrature points
320 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract B in x, then in y, per component.
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
329 
330 //------------------------------------------------------------------------------
331 // 2D interpolate transpose
332 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: contract B^T in y, then in x, per component.
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
341 
342 //------------------------------------------------------------------------------
343 // 2D derivatives at quadrature points
344 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Gradient at quadrature points:
  //   d/dx -> r_V[0..NCOMP), d/dy -> r_V[NCOMP..2*NCOMP)
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, tmp);         // G in x
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);         // B in y
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);         // B in x
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c + NCOMP]); // G in y
  }
}
355 
356 //------------------------------------------------------------------------------
357 // 2D derivatives transpose
358 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient: G^T_x B^T_y applied to the d/dx slot, plus
  // B^T_x G^T_y applied to the d/dy slot, summed into r_V per component.
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c]);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + NCOMP], c_G, tmp);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]); // accumulate
  }
}
369 
370 //------------------------------------------------------------------------------
371 // 3D
372 //------------------------------------------------------------------------------
373 
374 //------------------------------------------------------------------------------
375 // L-vector -> E-vector, offsets provided
376 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-column (P1d nodes) times NCOMP components for this
  // thread's (tidx, tidy) position, through the offset indices.
  // Register layout: r_u[z + c*P1d].
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[dof + COMPSTRIDE * c];
  }
}
387 
388 //------------------------------------------------------------------------------
389 // L-vector -> E-vector, strided
390 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather a full z-column times NCOMP components from a strided
  // (identity-restriction) L-vector. Register layout: r_u[z + c*P1d].
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (data.tidx + data.tidy*P1d + z*P1d*P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[base + c * STRIDES_COMP];
  }
}
401 
402 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
404 //------------------------------------------------------------------------------
// Read NCOMP components at one z-slice (level q) of an element's quadrature
// data, gathering through the offset indices.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; // was ';;' — stray empty statement removed
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
414 
415 //------------------------------------------------------------------------------
416 // E-vector -> Q-vector, strided
417 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Read NCOMP components at one z-slice (level q) from a strided Q-vector.
  if (data.tidx >= Q1d || data.tidy >= Q1d) return;
  const CeedInt base = (data.tidx + data.tidy*Q1d + q*Q1d*Q1d) * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
427 
428 //------------------------------------------------------------------------------
429 // E-vector -> L-vector, offsets provided
430 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  // Scatter-add a full z-column times NCOMP components into the L-vector
  // through the offset indices; atomicAdd because nodes are shared.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[z + c*P1d]);
  }
}
441 
442 //------------------------------------------------------------------------------
443 // E-vector -> L-vector, strided
444 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  // Accumulate a full z-column times NCOMP components into a strided
  // L-vector; plain += is safe since strided nodes are not shared.
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (data.tidx + data.tidy*P1d + z*P1d*P1d) * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c * STRIDES_COMP] += r_v[z + c*P1d];
  }
}
455 
456 //------------------------------------------------------------------------------
457 // 3D tensor contract x
458 //------------------------------------------------------------------------------
// Contract B along x for a z-column of values held in registers (U[k] per z
// level). B's row for this thread is preloaded into registers, then each z
// level is staged through the shared slice and reduced.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): the r_B preload indexes B[i + tidx*P1d] without a tidx < Q1d
// guard; for threads with tidx >= Q1d this reads past P1d*Q1d entries when
// P1d > Q1d (values unused under the guard below) — confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
475 
476 //------------------------------------------------------------------------------
477 // 3D tensor contract y
478 //------------------------------------------------------------------------------
// Contract B along y for a z-column of values held in registers; B's row for
// this thread's tidy is preloaded, each z level staged through the slice.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): the r_B preload indexes B[i + tidy*P1d] without a tidy < Q1d
// guard; for threads with tidy >= Q1d this reads past P1d*Q1d entries when
// P1d > Q1d (values unused under the guard below) — confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
495 
496 //------------------------------------------------------------------------------
497 // 3D tensor contract z
498 //------------------------------------------------------------------------------
// Contract B along z. The z direction lives entirely in this thread's
// registers (U[i] per level), so no shared-memory staging or barriers are
// needed; V[k] is zeroed unconditionally so inactive threads hold zeros.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
508 
509 //------------------------------------------------------------------------------
510 // 3D transpose tensor contract z
511 //------------------------------------------------------------------------------
// Contract B^T along z, register-only like ContractZ3d (no barriers);
// transposed access B[k + i*P1d] maps Q1d quadrature levels to P1d nodes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
521 
522 //------------------------------------------------------------------------------
523 // 3D transpose tensor contract y
524 //------------------------------------------------------------------------------
// Contract B^T along y for a z-column; B's transposed row for this thread's
// tidy is preloaded, each z level staged through the shared slice.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): the r_B preload indexes B[tidy + i*P1d] without a tidy < P1d
// guard; for threads with tidy >= P1d this reads past B's P1d*Q1d entries
// when Q1d > P1d (values unused under the guard below) — confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
541 
542 //------------------------------------------------------------------------------
543 // 3D transpose tensor contract add y
544 //------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V (no zeroing) — caller
// must have initialized V[k] for the active threads.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): same unguarded r_B preload as ContractTransposeY3d — can read
// past B's P1d*Q1d entries when Q1d > P1d for tidy >= P1d; confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
560 
561 //------------------------------------------------------------------------------
562 // 3D transpose tensor contract x
563 //------------------------------------------------------------------------------
// Contract B^T along x for a z-column; B's transposed row for this thread's
// tidx is preloaded, each z level staged through the shared slice.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): the r_B preload indexes B[tidx + i*P1d] without a tidx < P1d
// guard; for threads with tidx >= P1d this reads past B's P1d*Q1d entries
// when Q1d > P1d (values unused under the guard below) — confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
580 
581 //------------------------------------------------------------------------------
582 // 3D transpose tensor contract add x
583 //------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V (no zeroing) — caller
// must have initialized V[k] for the active threads.
// Preconditions: all threads of the block reach this call (contains barriers).
// NOTE(review): same unguarded r_B preload as ContractTransposeX3d — can read
// past B's P1d*Q1d entries when Q1d > P1d for tidx >= P1d; confirm intended.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged level k must be visible before reading
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // keep slice stable until level k is fully consumed
  }
}
599 
600 //------------------------------------------------------------------------------
601 // 3D interpolate to quadrature points
602 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract B in x, y, then z per component.
  // Register layouts: r_U[z + c*P1d], r_V[z + c*Q1d].
  CeedScalar tmp1[T1d];
  CeedScalar tmp2[T1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d]);
  }
}
613 
614 //------------------------------------------------------------------------------
615 // 3D interpolate transpose
616 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: contract B^T in z, y, then x per component.
  // Register layouts: r_U[z + c*Q1d], r_V[z + c*P1d].
  CeedScalar tmp1[T1d];
  CeedScalar tmp2[T1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, tmp1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*P1d]);
  }
}
627 
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers holding the intermediate results of each 1D contraction
  CeedScalar r_tmp1[T1d];
  CeedScalar r_tmp2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // One pass per derivative direction: apply the gradient matrix G in that
    // direction and the interpolation matrix B in the other two
    for (CeedInt dim = 0; dim < 3; dim++) {
      ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, dim == 0 ? c_G : c_B, r_tmp1);
      ContractY3d<NCOMP, P1d, Q1d>(data, r_tmp1, dim == 1 ? c_G : c_B, r_tmp2);
      ContractZ3d<NCOMP, P1d, Q1d>(data, r_tmp2, dim == 2 ? c_G : c_B, r_V + comp*Q1d + dim*NCOMP*Q1d);
    }
  }
}
647 
//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers holding the intermediate results of each 1D contraction
  CeedScalar r_tmp1[T1d];
  CeedScalar r_tmp2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // One pass per derivative direction, accumulating into r_V: the transposed
    // gradient matrix G is applied in that direction, B in the other two
    for (CeedInt dim = 0; dim < 3; dim++) {
      ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + dim*NCOMP*Q1d, dim == 2 ? c_G : c_B, r_tmp1);
      ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_tmp1, dim == 1 ? c_G : c_B, r_tmp2);
      if (dim == 0)
        ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_tmp2, c_G, r_V + comp*P1d); // First pass overwrites
      else
        ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_tmp2, c_B, r_V + comp*P1d); // Later passes accumulate
    }
  }
}
667 
//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Computes the X/Y/Z derivatives at quadrature slice q from collocated values
// r_U using the collocated gradient matrix c_G, writing Dim*NCOMP values to r_V.
// Fix: __syncthreads() must be reached by every thread in the block; the
// barriers were previously inside the (tidx < Q1d && tidy < Q1d) guard, which
// is undefined behavior whenever the block is larger than Q1d x Q1d
// (e.g. when T1d = P1d > Q1d). The barriers are now unconditional and the
// guard only protects the shared-memory accesses and register writes.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads(); // Slice must be fully populated before any thread reads it
    if (quad) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register data only, no shared memory needed)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    __syncthreads(); // Protect slice from being overwritten by the next component
  }
}
693 
//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulates the contribution of the Dim*NCOMP
// derivative values in r_U into quadrature slice q of r_V using the collocated
// gradient matrix c_G (transposed indexing).
// Fix: __syncthreads() must be reached by every thread in the block; the
// barriers were previously inside the (tidx < Q1d && tidy < Q1d) guard, which
// is undefined behavior whenever the block is larger than Q1d x Q1d
// (e.g. when T1d = P1d > Q1d). The barriers are now unconditional and the
// guard only protects the shared-memory accesses and register updates.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads(); // Slice must be fully populated before any thread reads it
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    __syncthreads(); // Protect slice before it is reused below
    // Y derivative
    if (quad)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads(); // Slice must be fully populated before any thread reads it
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    __syncthreads(); // Protect slice from being overwritten by the next component
    // Z derivative (register data only, no shared memory needed)
    if (quad)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
719 
//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Threads outside the quadrature range contribute a zero weight
  if (data.tidx < Q1d)
    *w = qweight1d[data.tidx];
  else
    *w = 0.0;
}
727 
//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Tensor-product weight; threads outside the quadrature range get zero
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  *w = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}
736 
//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Tensor-product weights for a full z-line of quadrature points;
  // threads outside the quadrature range write zeros
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedScalar wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = wxy*qweight1d[z];
  } else {
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = 0.0;
  }
}
747 
748 );
749 //------------------------------------------------------------------------------
// Build single operator kernel
751 //------------------------------------------------------------------------------
752 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
753 
754   using std::ostringstream;
755   using std::string;
756   int ierr;
757   bool setupdone;
758   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
759   if (setupdone) return CEED_ERROR_SUCCESS;
760   Ceed ceed;
761   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
762   CeedOperator_Cuda_gen *data;
763   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
764   CeedQFunction qf;
765   CeedQFunction_Cuda_gen *qf_data;
766   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
767   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
768   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
769           numoutputfields, ncomp, dim = 0, lsize;
770   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
771   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
772   CeedOperatorField *opinputfields, *opoutputfields;
773   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
774   CeedChkBackend(ierr);
775   CeedQFunctionField *qfinputfields, *qfoutputfields;
776   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
777   CeedChkBackend(ierr);
778   CeedEvalMode emode;
779   CeedBasis basis;
780   CeedBasis_Cuda_shared *basis_data;
781   CeedElemRestriction Erestrict;
782   CeedElemRestriction_Cuda *restr_data;
783 
784   // Check for restriction only identity operator
785   bool is_identity_qf;
786   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
787   if (is_identity_qf) {
788     CeedEvalMode emodein, emodeout;
789     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
790     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
791     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
792       // LCOV_EXCL_START
793       return CeedError(ceed, CEED_ERROR_BACKEND,
794                        "Backend does not implement restriction only identity operators");
795     // LCOV_EXCL_STOP
796   }
797 
798   ostringstream code;
799   string devFunctions(deviceFunctions);
800 
801   // Add atomicAdd function for old NVidia architectures
802   struct cudaDeviceProp prop;
803   Ceed_Cuda *ceed_data;
804   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
805   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr);
806   if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
807     code << atomicAdd;
808   }
809 
810   code << devFunctions;
811 
812   string qFunction(qf_data->qFunctionSource);
813   string qFunctionName(qf_data->qFunctionName);
814   string oper;
815   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
816 
817   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
818   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
819   code << "#define CeedPragmaSIMD\n";
820   code << "#define CEED_ERROR_SUCCESS 0\n\n";
821 
822   // Find dim and Q1d
823   bool useCollograd = true;
824   data->maxP1d = 0;
825   for (CeedInt i = 0; i < numinputfields; i++) {
826     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
827     if (basis != CEED_BASIS_COLLOCATED) {
828       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
829       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
830       CeedChkBackend(ierr);
831 
832       // Check for collocated gradient
833       useCollograd = useCollograd && basis_data->d_collograd1d;
834 
835       // Collect dim and Q1d
836       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
837       bool isTensor;
838       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
839       if (isTensor) {
840         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
841         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
842         if (P1d>data->maxP1d) data->maxP1d = P1d;
843       } else {
844         // LCOV_EXCL_START
845         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
846         // LCOV_EXCL_STOP
847         }
848     }
849   }
850   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
852   for (CeedInt i = 0; i < numoutputfields; i++) {
853     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
854 
855     if (basis != CEED_BASIS_COLLOCATED) {
856       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
857       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
858       CeedChkBackend(ierr);
859 
860       // Collect dim and Q1d
861       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
862       bool isTensor;
863       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
864       if (isTensor) {
865         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
866       } else {
867         // LCOV_EXCL_START
868         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
869         // LCOV_EXCL_STOP
870         }
871 
872       // Check for collocated gradient
873       useCollograd = useCollograd && basis_data->d_collograd1d;
874     }
875   }
876   data->dim = dim;
877   data->Q1d = Q1d;
878 
879   // Define CEED_Q_VLA
880   if (dim != 3 || useCollograd) {
881     code << "\n#define CEED_Q_VLA 1\n\n";
882   } else {
883     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
884   }
885 
886   code << qFunction;
887 
888   // Setup
889   code << "\n// -----------------------------------------------------------------------------\n";
890   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
891   for (CeedInt i = 0; i < numinputfields; i++) {
892     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
893     CeedChkBackend(ierr);
894     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
895       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
896     }
897   }
898 
899   for (CeedInt i = 0; i < numoutputfields; i++) {
900     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
901   }
902 
903   code << "  const CeedInt Dim = "<<dim<<";\n";
904   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
905 
906   code << "  extern __shared__ CeedScalar slice[];\n";
907   code << "  BackendData data;\n";
908   code << "  data.tidx = threadIdx.x;\n";
909   code << "  data.tidy = threadIdx.y;\n";
910   code << "  data.tidz = threadIdx.z;\n";
911   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
912   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
913 
914   code << "\n  // -- Input field constants and basis data --\n";
915   //Initialize constants, and matrices B and G
916   for (CeedInt i = 0; i < numinputfields; i++) {
917     code << "  // ---- Input field "<<i<<" ----\n";
918     // Get elemsize, emode, ncomp
919     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
920     CeedChkBackend(ierr);
921     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
922     CeedChkBackend(ierr);
923     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
924     CeedChkBackend(ierr);
925     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
926     CeedChkBackend(ierr);
927 
928     // Set field constants
929     if (emode != CEED_EVAL_WEIGHT) {
930       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
931       if (basis != CEED_BASIS_COLLOCATED) {
932         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
933         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
934       } else {
935         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
936       }
937       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
938     }
939 
940     // Load basis data
941     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
942     switch (emode) {
943     case CEED_EVAL_NONE:
944       break;
945     case CEED_EVAL_INTERP:
946       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
947       data->B.in[i] = basis_data->d_interp1d;
948       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
950       break;
951     case CEED_EVAL_GRAD:
952       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
953       data->B.in[i] = basis_data->d_interp1d;
954       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
955       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
956       if (useCollograd) {
957         data->G.in[i] = basis_data->d_collograd1d;
958         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
959         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
960       } else {
961         data->G.in[i] = basis_data->d_grad1d;
962         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
963         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
964       }
965       break;
966     case CEED_EVAL_WEIGHT:
967       break; // No action
968     case CEED_EVAL_DIV:
969       break; // TODO: Not implemented
970     case CEED_EVAL_CURL:
971       break; // TODO: Not implemented
972     }
973   }
974 
975   code << "\n  // -- Output field constants and basis data --\n";
976   for (CeedInt i = 0; i < numoutputfields; i++) {
977     code << "  // ---- Output field "<<i<<" ----\n";
978     // Get elemsize, emode, ncomp
979     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
980     CeedChkBackend(ierr);
981     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
982     CeedChkBackend(ierr);
983     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
984     CeedChkBackend(ierr);
985     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
986     CeedChkBackend(ierr);
987 
988     // Set field constants
989     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
990     if (basis != CEED_BASIS_COLLOCATED) {
991       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
992       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
993     } else {
994       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
995     }
996     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
997 
998     // Load basis data
999     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1000     switch (emode) {
1001     case CEED_EVAL_NONE:
1002       break; // No action
1003     case CEED_EVAL_INTERP:
1004       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1005       data->B.out[i] = basis_data->d_interp1d;
1006       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1008       break;
1009     case CEED_EVAL_GRAD:
1010       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1011       data->B.out[i] = basis_data->d_interp1d;
1012       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1013       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1014       if (useCollograd) {
1015         data->G.out[i] = basis_data->d_collograd1d;
1016         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1017         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1018       } else {
1019         data->G.out[i] = basis_data->d_grad1d;
1020         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1021         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1022       }
1023       break;
1024     // LCOV_EXCL_START
1025     case CEED_EVAL_WEIGHT: {
1026       Ceed ceed;
1027       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1028       return CeedError(ceed, CEED_ERROR_BACKEND,
1029                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1030       break; // Should not occur
1031     }
1032     case CEED_EVAL_DIV:
1033       break; // TODO: Not implemented
1034     case CEED_EVAL_CURL:
1035       break; // TODO: Not implemented
1036       // LCOV_EXCL_STOP
1037     }
1038   }
1039   code << "\n  // -- Element loop --\n";
1040   code << "  __syncthreads();\n";
1041   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1042   // Input basis apply if needed
1043   // Generate the correct eval mode code for each input
1044   code << "    // -- Input field restrictions and basis actions --\n";
1045   for (CeedInt i = 0; i < numinputfields; i++) {
1046     code << "    // ---- Input field "<<i<<" ----\n";
1047     // Get elemsize, emode, ncomp
1048     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1049     CeedChkBackend(ierr);
1050     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1051     CeedChkBackend(ierr);
1052     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1053     CeedChkBackend(ierr);
1054     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1055     CeedChkBackend(ierr);
1056 
1057     // Restriction
1058     if (emode != CEED_EVAL_WEIGHT &&
1059         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1060       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1061 
1062       bool isStrided;
1063       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1064       if (!isStrided) {
1065         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1066         CeedChkBackend(ierr);
1067         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1068         CeedInt compstride;
1069         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1070         code << "    // CompStride: "<<compstride<<"\n";
1071         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1072         data->indices.in[i] = restr_data->d_ind;
1073         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1074       } else {
1075         bool backendstrides;
1076         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1077         CeedChkBackend(ierr);
1078         CeedInt nelem;
1079         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1080         CeedChkBackend(ierr);
1081         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1082         if (!backendstrides) {
1083           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1084           CeedChkBackend(ierr);
1085         }
1086         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1087         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1088       }
1089     }
1090 
1091     // Basis action
1092     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1093     switch (emode) {
1094     case CEED_EVAL_NONE:
1095       if (!useCollograd) {
1096         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1097       }
1098       break;
1099     case CEED_EVAL_INTERP:
1100       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1101       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1102       break;
1103     case CEED_EVAL_GRAD:
1104       if (useCollograd) {
1105         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1106         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1107       } else {
1108         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1109         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1110       }
1111       break;
1112     case CEED_EVAL_WEIGHT:
1113       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1114       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1115       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1116       data->W = basis_data->d_qweight1d;
1117       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1118       break; // No action
1119     case CEED_EVAL_DIV:
1120       break; // TODO: Not implemented
1121     case CEED_EVAL_CURL:
1122       break; // TODO: Not implemented
1123     }
1124   }
1125 
1126   // Q function
1127   code << "\n    // -- Output field setup --\n";
1128   for (CeedInt i = 0; i < numoutputfields; i++) {
1129       code << "\n    // ---- Output field "<<i<<" ----\n";
1130     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1131     CeedChkBackend(ierr);
1132     if (emode==CEED_EVAL_GRAD)
1133     {
1134       if (useCollograd) {
1135         //Accumulator for gradient slices
1136         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1137         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1138         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1139         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1140         code << "      }\n";
1141         code << "    }\n";
1142       } else {
1143         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1144       }
1145     }
1146     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1147     {
1148       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1149     }
1150   }
1151   // We treat quadrature points per slice in 3d to save registers
1152   if (useCollograd) {
1153     code << "\n    // Note: Collocated Gradient\n";
1154     code << "#pragma unroll\n";
1155     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1156     code << "      // -- Input fields --\n";
1157     for (CeedInt i = 0; i < numinputfields; i++) {
1158       code << "      // ---- Input field "<<i<<" ----\n";
1159       // Get elemsize, emode, ncomp
1160       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1161       CeedChkBackend(ierr);
1162       // Basis action
1163       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1164       switch (emode) {
1165       case CEED_EVAL_NONE:
1166         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1167 
1168         bool isStrided;
1169         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1170         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1171         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1172         if (!isStrided) {
1173           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1174           CeedChkBackend(ierr);
1175           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1176           CeedInt compstride;
1177           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1178           code << "      // CompStride: "<<compstride<<"\n";
1179           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1180           data->indices.in[i] = restr_data->d_ind;
1181           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1182         } else {
1183           bool backendstrides;
1184           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1185           CeedChkBackend(ierr);
1186           CeedInt nelem;
1187           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1188           CeedChkBackend(ierr);
1189           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1190           if (!backendstrides) {
1191             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1192             CeedChkBackend(ierr);
1193           }
1194           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1195           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1196         }
1197         break;
1198       case CEED_EVAL_INTERP:
1199         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1200         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1201         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1202         code << "      }\n";
1203         break;
1204       case CEED_EVAL_GRAD:
1205         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1206         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1207         break;
1208       case CEED_EVAL_WEIGHT:
1209         code << "      CeedScalar r_q"<<i<<"[1];\n";
1210         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1211         break; // No action
1212       case CEED_EVAL_DIV:
1213         break; // TODO: Not implemented
1214       case CEED_EVAL_CURL:
1215         break; // TODO: Not implemented
1216       }
1217     }
1218     code << "\n      // -- Output fields --\n";
1219     for (CeedInt i = 0; i < numoutputfields; i++) {
1220       code << "      // ---- Output field "<<i<<" ----\n";
1221       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1222       CeedChkBackend(ierr);
1223       // Basis action
1224       switch (emode) {
1225       case CEED_EVAL_NONE:
1226         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1227         break; // No action
1228       case CEED_EVAL_INTERP:
1229         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1230         break;
1231       case CEED_EVAL_GRAD:
1232         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1233         break;
1234       case CEED_EVAL_WEIGHT:
1235         break; // Should not occur
1236       case CEED_EVAL_DIV:
1237         break; // TODO: Not implemented
1238       case CEED_EVAL_CURL:
1239         break; // TODO: Not implemented
1240       }
1241     }
1242   } else {
1243     code << "\n      // Note: No Collocated Gradient\n";
1244     code << "      // -- Input fields --\n";
1245     for (CeedInt i = 0; i < numinputfields; i++) {
1246       code << "      // ---- Input field "<<i<<" ----\n";
1247       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1248     }
1249     code << "      // -- Output fields --\n";
1250     for (CeedInt i = 0; i < numoutputfields; i++) {
1251       code << "      // ---- Output field "<<i<<" ----\n";
1252       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1253     }
1254   }
1255   code << "\n      // -- QFunction Inputs and outputs --\n";
1256   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1257   for (CeedInt i = 0; i < numinputfields; i++) {
1258     code << "      // ---- Input field "<<i<<" ----\n";
1259     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1260   }
1261   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1262   for (CeedInt i = 0; i < numoutputfields; i++) {
1263     code << "      // ---- Output field "<<i<<" ----\n";
1264     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1265   }
1266   code << "\n      // -- Apply QFunction --\n";
1267   code << "      "<<qFunctionName<<"(ctx, ";
1268   if (dim != 3 || useCollograd) {
1269     code << "1";
1270   } else {
1271     code << "Q1d";
1272   }
1273   code << ", in, out);\n";
1274   if (useCollograd) {
1275     code << "\n      // Note: Collocated Gradient\n";
1276     code << "      // -- Output fields --\n";
1277     for (CeedInt i = 0; i < numoutputfields; i++) {
1278       code << "      // ---- Output field "<<i<<" ----\n";
1279       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1280       CeedChkBackend(ierr);
1281       // Basis action
1282       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1283       switch (emode) {
1284       case CEED_EVAL_NONE:
1285         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1286         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1287         code << "      }\n";
1288         break; // No action
1289       case CEED_EVAL_INTERP:
1290         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1291         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1292         code << "      }\n";
1293         break;
1294       case CEED_EVAL_GRAD:
1295         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1296         break;
1297       case CEED_EVAL_WEIGHT:
1298         break; // Should not occur
1299       case CEED_EVAL_DIV:
1300         break; // TODO: Not implemented
1301       case CEED_EVAL_CURL:
1302         break; // TODO: Not implemented
1303       }
1304     }
1305     code << "    }\n";
1306   }
1307 
1308   // Output basis apply if needed
1309   // Generate the correct eval mode code for each output
1310   code << "\n    // -- Output field basis action and restrictions --\n";
1311   for (CeedInt i = 0; i < numoutputfields; i++) {
1312     code << "    // ---- Output field "<<i<<" ----\n";
1313     // Get elemsize, emode, ncomp
1314     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1315     CeedChkBackend(ierr);
1316     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1317     CeedChkBackend(ierr);
1318     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1319     CeedChkBackend(ierr);
1320     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1321     CeedChkBackend(ierr);
1322     // Basis action
1323     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1324     switch (emode) {
1325     case CEED_EVAL_NONE:
1326       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1327       break; // No action
1328     case CEED_EVAL_INTERP:
1329       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1330       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1331       break;
1332     case CEED_EVAL_GRAD:
1333       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1334       if (useCollograd) {
1335         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1336       } else {
1337         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1338       }
1339       break;
1340     // LCOV_EXCL_START
1341     case CEED_EVAL_WEIGHT: {
1342       Ceed ceed;
1343       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1344       return CeedError(ceed, CEED_ERROR_BACKEND,
1345                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1346       break; // Should not occur
1347     }
1348     case CEED_EVAL_DIV:
1349       break; // TODO: Not implemented
1350     case CEED_EVAL_CURL:
1351       break; // TODO: Not implemented
1352       // LCOV_EXCL_STOP
1353     }
1354     // Restriction
1355       bool isStrided;
1356       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1357     if (!isStrided) {
1358       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1359       CeedChkBackend(ierr);
1360       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1361       CeedInt compstride;
1362       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1363       code << "    // CompStride: "<<compstride<<"\n";
1364       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1365       data->indices.out[i] = restr_data->d_ind;
1366       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1367     } else {
1368       bool backendstrides;
1369       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1370       CeedChkBackend(ierr);
1371       CeedInt nelem;
1372       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1373       CeedChkBackend(ierr);
1374       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1375       if (!backendstrides) {
1376         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1377         CeedChkBackend(ierr);
1378       }
1379       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1380       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1381     }
1382   }
1383 
1384   code << "  }\n";
1385   code << "}\n";
1386   code << "// -----------------------------------------------------------------------------\n\n";
1387 
1388   // View kernel for debugging
1389   CeedDebug(ceed, code.str().c_str());
1390 
1391   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1392                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1393   CeedChkBackend(ierr);
1394   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1395   CeedChkBackend(ierr);
1396 
1397   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1398   return CEED_ERROR_SUCCESS;
1399 }
1400 //------------------------------------------------------------------------------
1401