xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 86c7fc999aa022469f08d96430426945f8749adb)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/ceed.h>
11 #include <ceed/backend.h>
12 #include <cuda_runtime.h>
13 #include <iostream>
14 #include <sstream>
15 #include "ceed-cuda-gen.h"
16 #include "../cuda/ceed-cuda-compile.h"
17 #include "../cuda-ref/ceed-cuda-ref.h"
18 #include "../cuda-shared/ceed-cuda-shared.h"
19 
20 static const char *atomicAdd = QUOTE(
21 //------------------------------------------------------------------------------
22 // Atomic add, for older CUDA
23 //------------------------------------------------------------------------------
24 __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
25   unsigned long long int *address_as_ull = (unsigned long long int *)address;
26   unsigned long long int old = *address_as_ull, assumed;
27   do {
28     assumed = old;
29     old =
30       atomicCAS(address_as_ull, assumed,
31                 __double_as_longlong(val +
32                                     __longlong_as_double(assumed)));
33     // Note: uses integer comparison to avoid hang in case of NaN
34     // (since NaN != NaN)
35   } while (assumed != old);
36   return __longlong_as_double(old);
37 }
38 );
39 
40 static const char *deviceFunctions = QUOTE(
41 
42 //------------------------------------------------------------------------------
43 // Typedefs
44 //------------------------------------------------------------------------------
45 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
46 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
47 
48 typedef struct {
49   CeedInt tidx;
50   CeedInt tidy;
51   CeedInt tidz;
52   CeedInt tid;
53   CeedScalar* slice;
54 } BackendData;
55 
56 //------------------------------------------------------------------------------
57 // Load matrices for basis actions
58 //------------------------------------------------------------------------------
59 template <int P, int Q>
60 inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
61   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
62     B[i] = d_B[i];
63 }
64 
65 //------------------------------------------------------------------------------
66 // 1D
67 //------------------------------------------------------------------------------
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
72 template <int NCOMP, int COMPSTRIDE, int P1d>
73 inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
74   if (data.tidx < P1d) {
75     const CeedInt node = data.tidx;
76     const CeedInt ind = indices[node + elem * P1d];
77     for (CeedInt comp = 0; comp < NCOMP; ++comp)
78       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
79   }
80 }
81 
82 //------------------------------------------------------------------------------
83 // L-vector -> E-vector, strided
84 //------------------------------------------------------------------------------
85 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
86 inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
87   if (data.tidx < P1d) {
88     const CeedInt node = data.tidx;
89     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
90     for (CeedInt comp = 0; comp < NCOMP; ++comp)
91       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
92   }
93 }
94 
95 //------------------------------------------------------------------------------
96 // E-vector -> L-vector, offsets provided
97 //------------------------------------------------------------------------------
98 template <int NCOMP, int COMPSTRIDE, int P1d>
99 inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
100   if (data.tidx < P1d) {
101     const CeedInt node = data.tidx;
102     const CeedInt ind = indices[node + elem * P1d];
103     for (CeedInt comp = 0; comp < NCOMP; ++comp)
104       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
105   }
106 }
107 
108 //------------------------------------------------------------------------------
109 // E-vector -> L-vector, strided
110 //------------------------------------------------------------------------------
111 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
112 inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
113   if (data.tidx < P1d) {
114     const CeedInt node = data.tidx;
115     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
116     for (CeedInt comp = 0; comp < NCOMP; ++comp)
117       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
118   }
119 }
120 
121 //------------------------------------------------------------------------------
122 // 1D tensor contraction x
123 //------------------------------------------------------------------------------
124 template <int NCOMP, int P1d, int Q1d>
125 inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
126   data.slice[data.tidx] = *U;
127   __syncthreads();
128   *V = 0.0;
129   if (data.tidx < Q1d)
130     for (CeedInt i = 0; i < P1d; ++i)
131       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
132   __syncthreads();
133 }
134 
135 //------------------------------------------------------------------------------
136 // 1D transpose tensor contraction x
137 //------------------------------------------------------------------------------
138 template <int NCOMP, int P1d, int Q1d>
139 inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
140   data.slice[data.tidx] = *U;
141   __syncthreads();
142   *V = 0.0;
143   if (data.tidx < P1d)
144     for (CeedInt i = 0; i < Q1d; ++i)
145       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
146   __syncthreads();
147 }
148 
149 //------------------------------------------------------------------------------
150 // 1D interpolate to quadrature points
151 //------------------------------------------------------------------------------
152 template <int NCOMP, int P1d, int Q1d>
153 inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
154   for (CeedInt comp = 0; comp < NCOMP; comp++)
155     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
156 }
157 
158 //------------------------------------------------------------------------------
159 // 1D interpolate transpose
160 //------------------------------------------------------------------------------
161 template <int NCOMP, int P1d, int Q1d>
162 inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163   for (CeedInt comp=0; comp<NCOMP; comp++)
164     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 1D derivatives at quadrature points
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp = 0; comp < NCOMP; comp++)
173     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D derivatives transpose
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp = 0; comp < NCOMP; comp++)
182     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 2D
187 //------------------------------------------------------------------------------
188 
189 //------------------------------------------------------------------------------
190 // L-vector -> E-vector, offsets provided
191 //------------------------------------------------------------------------------
192 template <int NCOMP, int COMPSTRIDE, int P1d>
193 inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
194   if (data.tidx < P1d && data.tidy < P1d) {
195     const CeedInt node = data.tidx + data.tidy*P1d;
196     const CeedInt ind = indices[node + elem * P1d*P1d];
197     for (CeedInt comp = 0; comp < NCOMP; ++comp)
198       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
199   }
200 }
201 
202 //------------------------------------------------------------------------------
203 // L-vector -> E-vector, strided
204 //------------------------------------------------------------------------------
205 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
206 inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
207   if (data.tidx < P1d && data.tidy < P1d) {
208     const CeedInt node = data.tidx + data.tidy*P1d;
209     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
210     for (CeedInt comp = 0; comp < NCOMP; ++comp)
211       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
212   }
213 }
214 
215 //------------------------------------------------------------------------------
216 // E-vector -> L-vector, offsets provided
217 //------------------------------------------------------------------------------
218 template <int NCOMP, int COMPSTRIDE, int P1d>
219 inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
220   if (data.tidx < P1d && data.tidy < P1d) {
221     const CeedInt node = data.tidx + data.tidy*P1d;
222     const CeedInt ind = indices[node + elem * P1d*P1d];
223     for (CeedInt comp = 0; comp < NCOMP; ++comp)
224       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
225   }
226 }
227 
228 //------------------------------------------------------------------------------
229 // E-vector -> L-vector, strided
230 //------------------------------------------------------------------------------
231 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
232 inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
233   if (data.tidx < P1d && data.tidy < P1d) {
234     const CeedInt node = data.tidx + data.tidy*P1d;
235     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
236     for (CeedInt comp = 0; comp < NCOMP; ++comp)
237       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 2D tensor contraction x
243 //------------------------------------------------------------------------------
244 template <int NCOMP, int P1d, int Q1d>
245 inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
246   data.slice[data.tidx+data.tidy*T1d] = *U;
247   __syncthreads();
248   *V = 0.0;
249   if (data.tidx < Q1d && data.tidy < P1d)
250     for (CeedInt i = 0; i < P1d; ++i)
251       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
252   __syncthreads();
253 }
254 
255 //------------------------------------------------------------------------------
256 // 2D tensor contract y
257 //------------------------------------------------------------------------------
258 template <int NCOMP, int P1d, int Q1d>
259 inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
260   data.slice[data.tidx+data.tidy*T1d] = *U;
261   __syncthreads();
262   *V = 0.0;
263   if (data.tidx < Q1d && data.tidy < Q1d)
264     for (CeedInt i = 0; i < P1d; ++i)
265       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
266   __syncthreads();
267 }
268 
269 //------------------------------------------------------------------------------
270 // 2D transpose tensor contract y
271 //------------------------------------------------------------------------------
272 template <int NCOMP, int P1d, int Q1d>
273 inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274   data.slice[data.tidx+data.tidy*T1d] = *U;
275   __syncthreads();
276   *V = 0.0;
277   if (data.tidx < Q1d && data.tidy < P1d)
278     for (CeedInt i = 0; i < Q1d; ++i)
279       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
280   __syncthreads();
281 }
282 
283 //------------------------------------------------------------------------------
284 // 2D transpose tensor contract x
285 //------------------------------------------------------------------------------
286 template <int NCOMP, int P1d, int Q1d>
287 inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
288   data.slice[data.tidx+data.tidy*T1d] = *U;
289   __syncthreads();
290   *V = 0.0;
291   if (data.tidx < P1d && data.tidy < P1d)
292     for (CeedInt i = 0; i < Q1d; ++i)
293       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
294   __syncthreads();
295 }
296 
297 //------------------------------------------------------------------------------
298 // 2D transpose tensor contract and add x
299 //------------------------------------------------------------------------------
300 template <int NCOMP, int P1d, int Q1d>
301 inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   data.slice[data.tidx+data.tidy*T1d] = *U;
303   __syncthreads();
304   if (data.tidx < P1d && data.tidy < P1d)
305     for (CeedInt i = 0; i < Q1d; ++i)
306       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
307   __syncthreads();
308 }
309 
310 //------------------------------------------------------------------------------
311 // 2D interpolate to quadrature points
312 //------------------------------------------------------------------------------
313 template <int NCOMP, int P1d, int Q1d>
314 inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315   CeedScalar r_t[1];
316   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319   }
320 }
321 
322 //------------------------------------------------------------------------------
323 // 2D interpolate transpose
324 //------------------------------------------------------------------------------
325 template <int NCOMP, int P1d, int Q1d>
326 inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327   CeedScalar r_t[1];
328   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 2D derivatives at quadrature points
336 //------------------------------------------------------------------------------
337 template <int NCOMP, int P1d, int Q1d>
338 inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339   CeedScalar r_t[1];
340   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345   }
346 }
347 
348 //------------------------------------------------------------------------------
349 // 2D derivatives transpose
350 //------------------------------------------------------------------------------
351 template <int NCOMP, int P1d, int Q1d>
352 inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353   CeedScalar r_t[1];
354   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359   }
360 }
361 
362 //------------------------------------------------------------------------------
363 // 3D
364 //------------------------------------------------------------------------------
365 
366 //------------------------------------------------------------------------------
367 // L-vector -> E-vector, offsets provided
368 //------------------------------------------------------------------------------
369 template <int NCOMP, int COMPSTRIDE, int P1d>
370 inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
371   if (data.tidx < P1d && data.tidy < P1d)
372     for (CeedInt z = 0; z < P1d; ++z) {
373       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375       for (CeedInt comp = 0; comp < NCOMP; ++comp)
376         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377     }
378 }
379 
380 //------------------------------------------------------------------------------
381 // L-vector -> E-vector, strided
382 //------------------------------------------------------------------------------
383 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384 inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
385   if (data.tidx < P1d && data.tidy < P1d)
386     for (CeedInt z = 0; z < P1d; ++z) {
387       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391     }
392 }
393 
394 //------------------------------------------------------------------------------
395 // E-vector -> Q-vector, offests provided
396 //------------------------------------------------------------------------------
397 template <int NCOMP, int COMPSTRIDE, int Q1d>
398 inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
399   if (data.tidx < Q1d && data.tidy < Q1d) {
400     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
401     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
402     for (CeedInt comp = 0; comp < NCOMP; ++comp)
403       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
404   }
405 }
406 
407 //------------------------------------------------------------------------------
408 // E-vector -> Q-vector, strided
409 //------------------------------------------------------------------------------
410 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
411 inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
412   if (data.tidx < Q1d && data.tidy < Q1d) {
413     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
414     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
415     for (CeedInt comp = 0; comp < NCOMP; ++comp)
416       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
417   }
418 }
419 
420 //------------------------------------------------------------------------------
421 // E-vector -> L-vector, offsets provided
422 //------------------------------------------------------------------------------
423 template <int NCOMP, int COMPSTRIDE, int P1d>
424 inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
425   if (data.tidx < P1d && data.tidy < P1d)
426     for (CeedInt z = 0; z < P1d; ++z) {
427       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
428       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
429       for (CeedInt comp = 0; comp < NCOMP; ++comp)
430         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
431     }
432 }
433 
434 //------------------------------------------------------------------------------
435 // E-vector -> L-vector, strided
436 //------------------------------------------------------------------------------
437 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
438 inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
439   if (data.tidx < P1d && data.tidy < P1d)
440     for (CeedInt z = 0; z < P1d; ++z) {
441       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
442       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
443       for (CeedInt comp = 0; comp < NCOMP; ++comp)
444         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
445     }
446 }
447 
448 //------------------------------------------------------------------------------
449 // 3D tensor contract x
450 //------------------------------------------------------------------------------
451 template <int NCOMP, int P1d, int Q1d>
452 inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
453   CeedScalar r_B[P1d];
454   for (CeedInt i = 0; i < P1d; ++i)
455     r_B[i] = B[i + data.tidx*P1d];
456 
457   for (CeedInt k = 0; k < P1d; ++k) {
458     data.slice[data.tidx+data.tidy*T1d] = U[k];
459     __syncthreads();
460     V[k] = 0.0;
461     if (data.tidx < Q1d && data.tidy < P1d)
462       for (CeedInt i = 0; i < P1d; ++i)
463         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
464     __syncthreads();
465   }
466 }
467 
468 //------------------------------------------------------------------------------
469 // 3D tensor contract y
470 //------------------------------------------------------------------------------
471 template <int NCOMP, int P1d, int Q1d>
472 inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
473   CeedScalar r_B[P1d];
474   for (CeedInt i = 0; i < P1d; ++i)
475     r_B[i] = B[i + data.tidy*P1d];
476 
477   for (CeedInt k = 0; k < P1d; ++k) {
478     data.slice[data.tidx+data.tidy*T1d] = U[k];
479     __syncthreads();
480     V[k] = 0.0;
481     if (data.tidx < Q1d && data.tidy < Q1d)
482       for (CeedInt i = 0; i < P1d; ++i)
483         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
484     __syncthreads();
485   }
486 }
487 
488 //------------------------------------------------------------------------------
489 // 3D tensor contract z
490 //------------------------------------------------------------------------------
491 template <int NCOMP, int P1d, int Q1d>
492 inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
493   for (CeedInt k = 0; k < Q1d; ++k) {
494     V[k] = 0.0;
495     if (data.tidx < Q1d && data.tidy < Q1d)
496       for (CeedInt i = 0; i < P1d; ++i)
497         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
498   }
499 }
500 
501 //------------------------------------------------------------------------------
502 // 3D transpose tensor contract z
503 //------------------------------------------------------------------------------
504 template <int NCOMP, int P1d, int Q1d>
505 inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
506   for (CeedInt k = 0; k < P1d; ++k) {
507     V[k] = 0.0;
508     if (data.tidx < Q1d && data.tidy < Q1d)
509       for (CeedInt i = 0; i < Q1d; ++i)
510         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
511   }
512 }
513 
514 //------------------------------------------------------------------------------
515 // 3D transpose tensor contract y
516 //------------------------------------------------------------------------------
517 template <int NCOMP, int P1d, int Q1d>
518 inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
519   CeedScalar r_B[Q1d];
520   for (CeedInt i = 0; i < Q1d; ++i)
521     r_B[i] = B[data.tidy + i*P1d];
522 
523   for (CeedInt k = 0; k < P1d; ++k) {
524     data.slice[data.tidx+data.tidy*T1d] = U[k];
525     __syncthreads();
526     V[k] = 0.0;
527     if (data.tidx < Q1d && data.tidy < P1d)
528       for (CeedInt i = 0; i < Q1d; ++i)
529         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
530     __syncthreads();
531   }
532 }
533 
534 //------------------------------------------------------------------------------
535 // 3D transpose tensor contract add y
536 //------------------------------------------------------------------------------
537 template <int NCOMP, int P1d, int Q1d>
538 inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
539   CeedScalar r_B[Q1d];
540   for (CeedInt i = 0; i < Q1d; ++i)
541     r_B[i] = B[data.tidy + i*P1d];
542 
543   for (CeedInt k = 0; k < P1d; ++k) {
544     data.slice[data.tidx+data.tidy*T1d] = U[k];
545     __syncthreads();
546     if (data.tidx < Q1d && data.tidy < P1d)
547       for (CeedInt i = 0; i < Q1d; ++i)
548         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
549     __syncthreads();
550   }
551 }
552 
553 //------------------------------------------------------------------------------
554 // 3D transpose tensor contract x
555 //------------------------------------------------------------------------------
556 template <int NCOMP, int P1d, int Q1d>
557 inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
558   CeedScalar r_B[Q1d];
559   for (CeedInt i = 0; i < Q1d; ++i)
560     r_B[i] = B[data.tidx + i*P1d];
561 
562   for (CeedInt k = 0; k < P1d; ++k) {
563     data.slice[data.tidx+data.tidy*T1d] = U[k];
564     __syncthreads();
565     V[k] = 0.0;
566     if (data.tidx < P1d && data.tidy < P1d)
567       for (CeedInt i = 0; i < Q1d; ++i)
568         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
569     __syncthreads();
570   }
571 }
572 
573 //------------------------------------------------------------------------------
574 // 3D transpose tensor contract add x
575 //------------------------------------------------------------------------------
576 template <int NCOMP, int P1d, int Q1d>
577 inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
578   CeedScalar r_B[Q1d];
579   for (CeedInt i = 0; i < Q1d; ++i)
580     r_B[i] = B[data.tidx + i*P1d];
581 
582   for (CeedInt k = 0; k < P1d; ++k) {
583     data.slice[data.tidx+data.tidy*T1d] = U[k];
584     __syncthreads();
585     if (data.tidx < P1d && data.tidy < P1d)
586       for (CeedInt i = 0; i < Q1d; ++i)
587         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
588     __syncthreads();
589   }
590 }
591 
592 //------------------------------------------------------------------------------
593 // 3D interpolate to quadrature points
594 //------------------------------------------------------------------------------
595 template <int NCOMP, int P1d, int Q1d>
596 inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
597   CeedScalar r_t1[T1d];
598   CeedScalar r_t2[T1d];
599   for (CeedInt comp = 0; comp < NCOMP; comp++) {
600     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
601     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
602     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
603   }
604 }
605 
606 //------------------------------------------------------------------------------
607 // 3D interpolate transpose
608 //------------------------------------------------------------------------------
609 template <int NCOMP, int P1d, int Q1d>
610 inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
611   CeedScalar r_t1[T1d];
612   CeedScalar r_t2[T1d];
613   for (CeedInt comp = 0; comp < NCOMP; comp++) {
614     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
615     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
616     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
617   }
618 }
619 
620 //------------------------------------------------------------------------------
621 // 3D derivatives at quadrature points
622 //------------------------------------------------------------------------------
623 template <int NCOMP, int P1d, int Q1d>
624 inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
625   CeedScalar r_t1[T1d];
626   CeedScalar r_t2[T1d];
627   for (CeedInt comp = 0; comp < NCOMP; comp++) {
628     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
629     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
630     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
631     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
632     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
633     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
634     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
635     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
636     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
637   }
638 }
639 
640 //------------------------------------------------------------------------------
641 // 3D derivatives transpose
642 //------------------------------------------------------------------------------
643 template <int NCOMP, int P1d, int Q1d>
644 inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
645   CeedScalar r_t1[T1d];
646   CeedScalar r_t2[T1d];
647   for (CeedInt comp = 0; comp < NCOMP; comp++) {
648     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
649     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
650     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
651     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
652     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
653     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
654     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
655     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
656     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
657   }
658 }
659 
660 //------------------------------------------------------------------------------
661 // 3D collocated derivatives computation
662 //------------------------------------------------------------------------------
663 template <int NCOMP, int Q1d>
664 inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
665   if (data.tidx < Q1d && data.tidy < Q1d) {
666     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
667       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
668       __syncthreads();
669       // X derivative
670       r_V[comp+0*NCOMP] = 0.0;
671       for (CeedInt i = 0; i < Q1d; ++i)
672         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
673       // Y derivative
674       r_V[comp+1*NCOMP] = 0.0;
675       for (CeedInt i = 0; i < Q1d; ++i)
676         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
677       // Z derivative
678       r_V[comp+2*NCOMP] = 0.0;
679       for (CeedInt i = 0; i < Q1d; ++i)
680         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
681       __syncthreads();
682     }
683   }
684 }
685 
686 //------------------------------------------------------------------------------
687 // 3D collocated derivatives transpose
688 //------------------------------------------------------------------------------
689 template <int NCOMP, int Q1d>
690 inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
691   if (data.tidx < Q1d && data.tidy < Q1d) {
692     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
693       // X derivative
694       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
695       __syncthreads();
696       for (CeedInt i = 0; i < Q1d; ++i)
697         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
698       __syncthreads();
699       // Y derivative
700       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
701       __syncthreads();
702       for (CeedInt i = 0; i < Q1d; ++i)
703         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
704       __syncthreads();
705       // Z derivative
706       for (CeedInt i = 0; i < Q1d; ++i)
707         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
708     }
709   }
710 }
711 
712 //------------------------------------------------------------------------------
713 // 1D quadrature weights
714 //------------------------------------------------------------------------------
715 template <int Q1d>
716 inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
717   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
718 }
719 
720 //------------------------------------------------------------------------------
721 // 2D quadrature weights
722 //------------------------------------------------------------------------------
723 template <int Q1d>
724 inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
725   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
726         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
727 }
728 
729 //------------------------------------------------------------------------------
730 // 3D quadrature weights
731 //------------------------------------------------------------------------------
732 template <int Q1d>
733 inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
734   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
735   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736   for (CeedInt z = 0; z < Q1d; ++z)
737     w[z] = quad ? pw*qweight1d[z] : 0.0;
738 }
739 
740 );
741 //------------------------------------------------------------------------------
742 // Build singe operator kernel
743 //------------------------------------------------------------------------------
744 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
745 
746   using std::ostringstream;
747   using std::string;
748   int ierr;
749   bool setupdone;
750   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
751   if (setupdone) return CEED_ERROR_SUCCESS;
752   Ceed ceed;
753   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
754   CeedOperator_Cuda_gen *data;
755   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
756   CeedQFunction qf;
757   CeedQFunction_Cuda_gen *qf_data;
758   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
759   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
760   CeedSize lsize;
761   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
762           numoutputfields, ncomp, dim = 1;
763   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
764   Q1d = Q;
765   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
766   CeedOperatorField *opinputfields, *opoutputfields;
767   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
768   CeedChkBackend(ierr);
769   CeedQFunctionField *qfinputfields, *qfoutputfields;
770   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
771   CeedChkBackend(ierr);
772   CeedEvalMode emode;
773   CeedBasis basis;
774   CeedBasis_Cuda_shared *basis_data;
775   CeedElemRestriction Erestrict;
776   CeedElemRestriction_Cuda *restr_data;
777 
778   // Check for restriction only identity operator
779   bool is_identity_qf;
780   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
781   if (is_identity_qf) {
782     CeedEvalMode emodein, emodeout;
783     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
784     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
785     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
786       // LCOV_EXCL_START
787       return CeedError(ceed, CEED_ERROR_BACKEND,
788                        "Backend does not implement restriction only identity operators");
789     // LCOV_EXCL_STOP
790   }
791 
792   ostringstream code;
793   string devFunctions(deviceFunctions);
794 
795   // Add atomicAdd function for old NVidia architectures
796   struct cudaDeviceProp prop;
797   Ceed_Cuda *ceed_data;
798   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
799   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
800   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
801     code << atomicAdd;
802   }
803 
804   code << devFunctions;
805 
806   string qFunction(qf_data->qFunctionSource);
807   string qFunctionName(qf_data->qFunctionName);
808   string oper;
809   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
810 
811   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
812   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
813   code << "#define CeedPragmaSIMD\n";
814   code << "#define CEED_ERROR_SUCCESS 0\n\n";
815 
816   // Find dim and Q1d
817   bool useCollograd = true;
818   bool allCollograd = true;
819   data->maxP1d = 0;
820   for (CeedInt i = 0; i < numinputfields; i++) {
821     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
822     if (basis != CEED_BASIS_COLLOCATED) {
823       allCollograd = false;
824       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
825       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
826       CeedChkBackend(ierr);
827 
828       // Check for collocated gradient
829       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
830 
831       // Collect dim and Q1d
832       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
833       bool isTensor;
834       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
835       if (isTensor) {
836         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
837         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
838         if (P1d>data->maxP1d) data->maxP1d = P1d;
839       } else {
840         // LCOV_EXCL_START
841         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
842         // LCOV_EXCL_STOP
843         }
844     }
845   }
846   // Check output bases for Q1d, dim as well
847   //   The only imput basis might be CEED_BASIS_COLLOCATED
848   for (CeedInt i = 0; i < numoutputfields; i++) {
849     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
850 
851     if (basis != CEED_BASIS_COLLOCATED) {
852       allCollograd = false;
853       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
854       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
855       CeedChkBackend(ierr);
856 
857       // Collect dim and Q1d
858       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
859       bool isTensor;
860       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
861       if (isTensor) {
862         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
863       } else {
864         // LCOV_EXCL_START
865         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
866         // LCOV_EXCL_STOP
867         }
868 
869       // Check for collocated gradient
870       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
871     }
872   }
873   data->dim = dim;
874   data->Q1d = Q1d;
875   // TODO: https://github.com/CEED/libCEED/pull/1009#issuecomment-1176751436
876   useCollograd &= !allCollograd;
877 
878   // Define CEED_Q_VLA
879   if (dim != 3 || useCollograd) {
880     code << "\n#define CEED_Q_VLA 1\n\n";
881   } else {
882     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
883   }
884 
885   code << qFunction;
886 
887   // Setup
888   code << "\n// -----------------------------------------------------------------------------\n";
889   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
890   for (CeedInt i = 0; i < numinputfields; i++) {
891     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
892     CeedChkBackend(ierr);
893     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
894       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
895     }
896   }
897 
898   for (CeedInt i = 0; i < numoutputfields; i++) {
899     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
900   }
901 
902   code << "  const CeedInt Dim = "<<dim<<";\n";
903   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
904 
905   code << "  extern __shared__ CeedScalar slice[];\n";
906   code << "  BackendData data;\n";
907   code << "  data.tidx = threadIdx.x;\n";
908   code << "  data.tidy = threadIdx.y;\n";
909   code << "  data.tidz = threadIdx.z;\n";
910   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
911   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
912 
913   code << "\n  // -- Input field constants and basis data --\n";
914   //Initialize constants, and matrices B and G
915   for (CeedInt i = 0; i < numinputfields; i++) {
916     code << "  // ---- Input field "<<i<<" ----\n";
917     // Get elemsize, emode, ncomp
918     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
919     CeedChkBackend(ierr);
920     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
921     CeedChkBackend(ierr);
922     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
923     CeedChkBackend(ierr);
924     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
925     CeedChkBackend(ierr);
926 
927     // Set field constants
928     if (emode != CEED_EVAL_WEIGHT) {
929       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
930       if (basis != CEED_BASIS_COLLOCATED) {
931         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
932         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
933       } else {
934         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
935       }
936       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
937     }
938 
939     // Load basis data
940     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
941     switch (emode) {
942     case CEED_EVAL_NONE:
943       break;
944     case CEED_EVAL_INTERP:
945       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
946       data->B.in[i] = basis_data->d_interp_1d;
947       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
948       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
949       break;
950     case CEED_EVAL_GRAD:
951       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
952       data->B.in[i] = basis_data->d_interp_1d;
953       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
954       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
955       if (useCollograd) {
956         data->G.in[i] = basis_data->d_collo_grad_1d;
957         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
958         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
959       } else {
960         data->G.in[i] = basis_data->d_grad_1d;
961         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
962         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
963       }
964       break;
965     case CEED_EVAL_WEIGHT:
966       break; // No action
967     case CEED_EVAL_DIV:
968       break; // TODO: Not implemented
969     case CEED_EVAL_CURL:
970       break; // TODO: Not implemented
971     }
972   }
973 
974   code << "\n  // -- Output field constants and basis data --\n";
975   for (CeedInt i = 0; i < numoutputfields; i++) {
976     code << "  // ---- Output field "<<i<<" ----\n";
977     // Get elemsize, emode, ncomp
978     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
979     CeedChkBackend(ierr);
980     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
981     CeedChkBackend(ierr);
982     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
983     CeedChkBackend(ierr);
984     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
985     CeedChkBackend(ierr);
986 
987     // Set field constants
988     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
989     if (basis != CEED_BASIS_COLLOCATED) {
990       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
991       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
992     } else {
993       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
994     }
995     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
996 
997     // Load basis data
998     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
999     switch (emode) {
1000     case CEED_EVAL_NONE:
1001       break; // No action
1002     case CEED_EVAL_INTERP:
1003       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1004       data->B.out[i] = basis_data->d_interp_1d;
1005       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1006       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1007       break;
1008     case CEED_EVAL_GRAD:
1009       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1010       data->B.out[i] = basis_data->d_interp_1d;
1011       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1012       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1013       if (useCollograd) {
1014         data->G.out[i] = basis_data->d_collo_grad_1d;
1015         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1016         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1017       } else {
1018         data->G.out[i] = basis_data->d_grad_1d;
1019         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1020         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1021       }
1022       break;
1023     // LCOV_EXCL_START
1024     case CEED_EVAL_WEIGHT: {
1025       Ceed ceed;
1026       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1027       return CeedError(ceed, CEED_ERROR_BACKEND,
1028                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1029       break; // Should not occur
1030     }
1031     case CEED_EVAL_DIV:
1032       break; // TODO: Not implemented
1033     case CEED_EVAL_CURL:
1034       break; // TODO: Not implemented
1035       // LCOV_EXCL_STOP
1036     }
1037   }
1038   code << "\n  // -- Element loop --\n";
1039   code << "  __syncthreads();\n";
1040   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1041   // Input basis apply if needed
1042   // Generate the correct eval mode code for each input
1043   code << "    // -- Input field restrictions and basis actions --\n";
1044   for (CeedInt i = 0; i < numinputfields; i++) {
1045     code << "    // ---- Input field "<<i<<" ----\n";
1046     // Get elemsize, emode, ncomp
1047     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1048     CeedChkBackend(ierr);
1049     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1050     CeedChkBackend(ierr);
1051     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1052     CeedChkBackend(ierr);
1053     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1054     CeedChkBackend(ierr);
1055 
1056     // Restriction
1057     if (emode != CEED_EVAL_WEIGHT &&
1058         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1059       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1060 
1061       bool isStrided;
1062       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1063       if (!isStrided) {
1064         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1065         CeedChkBackend(ierr);
1066         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1067         CeedInt compstride;
1068         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1069         code << "    // CompStride: "<<compstride<<"\n";
1070         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1071         data->indices.in[i] = restr_data->d_ind;
1072         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1073       } else {
1074         bool backendstrides;
1075         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1076         CeedChkBackend(ierr);
1077         CeedInt nelem;
1078         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1079         CeedChkBackend(ierr);
1080         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1081         if (!backendstrides) {
1082           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1083           CeedChkBackend(ierr);
1084         }
1085         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1086         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1087       }
1088     }
1089 
1090     // Basis action
1091     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1092     switch (emode) {
1093     case CEED_EVAL_NONE:
1094       if (!useCollograd) {
1095         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1096       }
1097       break;
1098     case CEED_EVAL_INTERP:
1099       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101       break;
1102     case CEED_EVAL_GRAD:
1103       if (useCollograd) {
1104         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1105         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1106       } else {
1107         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1108         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1109       }
1110       break;
1111     case CEED_EVAL_WEIGHT:
1112       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1113       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1114       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1115       data->W = basis_data->d_q_weight_1d;
1116       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1117       break; // No action
1118     case CEED_EVAL_DIV:
1119       break; // TODO: Not implemented
1120     case CEED_EVAL_CURL:
1121       break; // TODO: Not implemented
1122     }
1123   }
1124 
1125   // Q function
1126   code << "\n    // -- Output field setup --\n";
1127   for (CeedInt i = 0; i < numoutputfields; i++) {
1128       code << "\n    // ---- Output field "<<i<<" ----\n";
1129     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1130     CeedChkBackend(ierr);
1131     if (emode==CEED_EVAL_GRAD)
1132     {
1133       if (useCollograd) {
1134         //Accumulator for gradient slices
1135         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1136         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1137         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1138         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1139         code << "      }\n";
1140         code << "    }\n";
1141       } else {
1142         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1143       }
1144     }
1145     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1146     {
1147       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1148     }
1149   }
1150   // We treat quadrature points per slice in 3d to save registers
1151   if (useCollograd) {
1152     code << "\n    // Note: Collocated Gradient\n";
1153     code << "#pragma unroll\n";
1154     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1155     code << "      // -- Input fields --\n";
1156     for (CeedInt i = 0; i < numinputfields; i++) {
1157       code << "      // ---- Input field "<<i<<" ----\n";
1158       // Get elemsize, emode, ncomp
1159       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1160       CeedChkBackend(ierr);
1161       // Basis action
1162       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1163       switch (emode) {
1164       case CEED_EVAL_NONE:
1165         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1166 
1167         bool isStrided;
1168         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1169         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1170         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1171         if (!isStrided) {
1172           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1173           CeedChkBackend(ierr);
1174           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1175           CeedInt compstride;
1176           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1177           code << "      // CompStride: "<<compstride<<"\n";
1178           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1179           data->indices.in[i] = restr_data->d_ind;
1180           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1181         } else {
1182           bool backendstrides;
1183           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1184           CeedChkBackend(ierr);
1185           CeedInt nelem;
1186           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1187           CeedChkBackend(ierr);
1188           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1189           if (!backendstrides) {
1190             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1191             CeedChkBackend(ierr);
1192           }
1193           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1194           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1195         }
1196         break;
1197       case CEED_EVAL_INTERP:
1198         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1199         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1200         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1201         code << "      }\n";
1202         break;
1203       case CEED_EVAL_GRAD:
1204         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1205         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1206         break;
1207       case CEED_EVAL_WEIGHT:
1208         code << "      CeedScalar r_q"<<i<<"[1];\n";
1209         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1210         break; // No action
1211       case CEED_EVAL_DIV:
1212         break; // TODO: Not implemented
1213       case CEED_EVAL_CURL:
1214         break; // TODO: Not implemented
1215       }
1216     }
1217     code << "\n      // -- Output fields --\n";
1218     for (CeedInt i = 0; i < numoutputfields; i++) {
1219       code << "      // ---- Output field "<<i<<" ----\n";
1220       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1221       CeedChkBackend(ierr);
1222       // Basis action
1223       switch (emode) {
1224       case CEED_EVAL_NONE:
1225         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1226         break; // No action
1227       case CEED_EVAL_INTERP:
1228         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1229         break;
1230       case CEED_EVAL_GRAD:
1231         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1232         break;
1233       case CEED_EVAL_WEIGHT:
1234         break; // Should not occur
1235       case CEED_EVAL_DIV:
1236         break; // TODO: Not implemented
1237       case CEED_EVAL_CURL:
1238         break; // TODO: Not implemented
1239       }
1240     }
1241   } else {
1242     code << "\n      // Note: No Collocated Gradient\n";
1243     code << "      // -- Input fields --\n";
1244     for (CeedInt i = 0; i < numinputfields; i++) {
1245       code << "      // ---- Input field "<<i<<" ----\n";
1246       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1247     }
1248     code << "      // -- Output fields --\n";
1249     for (CeedInt i = 0; i < numoutputfields; i++) {
1250       code << "      // ---- Output field "<<i<<" ----\n";
1251       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1252     }
1253   }
1254   code << "\n      // -- QFunction Inputs and outputs --\n";
1255   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1256   for (CeedInt i = 0; i < numinputfields; i++) {
1257     code << "      // ---- Input field "<<i<<" ----\n";
1258     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1259   }
1260   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1261   for (CeedInt i = 0; i < numoutputfields; i++) {
1262     code << "      // ---- Output field "<<i<<" ----\n";
1263     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1264   }
1265   code << "\n      // -- Apply QFunction --\n";
1266   code << "      "<<qFunctionName<<"(ctx, ";
1267   if (dim != 3 || useCollograd) {
1268     code << "1";
1269   } else {
1270     code << "Q1d";
1271   }
1272   code << ", in, out);\n";
1273   if (useCollograd) {
1274     code << "\n      // Note: Collocated Gradient\n";
1275     code << "      // -- Output fields --\n";
1276     for (CeedInt i = 0; i < numoutputfields; i++) {
1277       code << "      // ---- Output field "<<i<<" ----\n";
1278       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1279       CeedChkBackend(ierr);
1280       // Basis action
1281       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1282       switch (emode) {
1283       case CEED_EVAL_NONE:
1284         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1285         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1286         code << "      }\n";
1287         break; // No action
1288       case CEED_EVAL_INTERP:
1289         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1290         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1291         code << "      }\n";
1292         break;
1293       case CEED_EVAL_GRAD:
1294         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1295         break;
1296       case CEED_EVAL_WEIGHT:
1297         break; // Should not occur
1298       case CEED_EVAL_DIV:
1299         break; // TODO: Not implemented
1300       case CEED_EVAL_CURL:
1301         break; // TODO: Not implemented
1302       }
1303     }
1304     code << "    }\n";
1305   }
1306 
1307   // Output basis apply if needed
1308   // Generate the correct eval mode code for each output
1309   code << "\n    // -- Output field basis action and restrictions --\n";
1310   for (CeedInt i = 0; i < numoutputfields; i++) {
1311     code << "    // ---- Output field "<<i<<" ----\n";
1312     // Get elemsize, emode, ncomp
1313     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1314     CeedChkBackend(ierr);
1315     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1316     CeedChkBackend(ierr);
1317     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1318     CeedChkBackend(ierr);
1319     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1320     CeedChkBackend(ierr);
1321     // Basis action
1322     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1323     switch (emode) {
1324     case CEED_EVAL_NONE:
1325       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1326       break; // No action
1327     case CEED_EVAL_INTERP:
1328       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1329       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1330       break;
1331     case CEED_EVAL_GRAD:
1332       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1333       if (useCollograd) {
1334         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1335       } else {
1336         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1337       }
1338       break;
1339     // LCOV_EXCL_START
1340     case CEED_EVAL_WEIGHT: {
1341       Ceed ceed;
1342       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1343       return CeedError(ceed, CEED_ERROR_BACKEND,
1344                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1345       break; // Should not occur
1346     }
1347     case CEED_EVAL_DIV:
1348       break; // TODO: Not implemented
1349     case CEED_EVAL_CURL:
1350       break; // TODO: Not implemented
1351       // LCOV_EXCL_STOP
1352     }
1353     // Restriction
1354       bool isStrided;
1355       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1356     if (!isStrided) {
1357       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1358       CeedChkBackend(ierr);
1359       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1360       CeedInt compstride;
1361       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1362       code << "    // CompStride: "<<compstride<<"\n";
1363       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1364       data->indices.out[i] = restr_data->d_ind;
1365       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1366     } else {
1367       bool backendstrides;
1368       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1369       CeedChkBackend(ierr);
1370       CeedInt nelem;
1371       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1372       CeedChkBackend(ierr);
1373       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1374       if (!backendstrides) {
1375         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1376         CeedChkBackend(ierr);
1377       }
1378       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1379       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1380     }
1381   }
1382 
1383   code << "  }\n";
1384   code << "}\n";
1385   code << "// -----------------------------------------------------------------------------\n\n";
1386 
1387   // View kernel for debugging
1388   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1389   CeedDebug(ceed, code.str().c_str());
1390 
1391   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1392                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1393   CeedChkBackend(ierr);
1394   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1395   CeedChkBackend(ierr);
1396 
1397   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1398   return CEED_ERROR_SUCCESS;
1399 }
1400 //------------------------------------------------------------------------------
1401