xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 28d09c206720ba8516c835daf824672e3380cdbd)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <cuda_runtime.h>
22 #include <iostream>
23 #include <sstream>
24 #include "ceed-cuda-gen.h"
25 #include "../cuda-shared/ceed-cuda-shared.h"
26 
// JIT source string: double-precision atomicAdd emulation via an atomicCAS
// loop, for GPU architectures without a native double atomicAdd (pre-SM60).
// NOTE(review): assumes CeedScalar is double (__double_as_longlong) — confirm
// against the CeedScalar typedef. Comments inside QUOTE are presumably stripped
// by the preprocessor before stringification, so they serve readers of this
// file only.
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Re-read, add, and attempt to publish; retry until no other thread
    // modified *address between the read and the CAS.
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                    __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
46 
47 static const char *deviceFunctions = QUOTE(
48 
49 //------------------------------------------------------------------------------
50 // Typedefs
51 //------------------------------------------------------------------------------
52 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
53 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
54 
55 typedef struct {
56   CeedInt tidx;
57   CeedInt tidy;
58   CeedInt tidz;
59   CeedInt tid;
60   CeedScalar* slice;
61 } BackendData;
62 
63 //------------------------------------------------------------------------------
64 // Load matrices for basis actions
65 //------------------------------------------------------------------------------
66 template <int P, int Q>
67 inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
68   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
69     B[i] = d_B[i];
70 }
71 
72 //------------------------------------------------------------------------------
73 // 1D
74 //------------------------------------------------------------------------------
75 
76 //------------------------------------------------------------------------------
77 // L-vector -> E-vector, offsets provided
78 //------------------------------------------------------------------------------
79 template <int NCOMP, int COMPSTRIDE, int P1d>
80 inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
81   if (data.tidx < P1d) {
82     const CeedInt node = data.tidx;
83     const CeedInt ind = indices[node + elem * P1d];
84     for (CeedInt comp = 0; comp < NCOMP; ++comp)
85       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
86   }
87 }
88 
89 //------------------------------------------------------------------------------
90 // L-vector -> E-vector, strided
91 //------------------------------------------------------------------------------
92 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
93 inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
94   if (data.tidx < P1d) {
95     const CeedInt node = data.tidx;
96     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
97     for (CeedInt comp = 0; comp < NCOMP; ++comp)
98       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
99   }
100 }
101 
102 //------------------------------------------------------------------------------
103 // E-vector -> L-vector, offsets provided
104 //------------------------------------------------------------------------------
105 template <int NCOMP, int COMPSTRIDE, int P1d>
106 inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
107   if (data.tidx < P1d) {
108     const CeedInt node = data.tidx;
109     const CeedInt ind = indices[node + elem * P1d];
110     for (CeedInt comp = 0; comp < NCOMP; ++comp)
111       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
112   }
113 }
114 
115 //------------------------------------------------------------------------------
116 // E-vector -> L-vector, strided
117 //------------------------------------------------------------------------------
118 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
119 inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
120   if (data.tidx < P1d) {
121     const CeedInt node = data.tidx;
122     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
123     for (CeedInt comp = 0; comp < NCOMP; ++comp)
124       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
125   }
126 }
127 
128 //------------------------------------------------------------------------------
129 // 1D tensor contraction x
130 //------------------------------------------------------------------------------
131 template <int NCOMP, int P1d, int Q1d>
132 inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
133   data.slice[data.tidx] = *U;
134   __syncthreads();
135   *V = 0.0;
136   if (data.tidx < Q1d)
137     for (CeedInt i = 0; i < P1d; ++i)
138       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
139   __syncthreads();
140 }
141 
142 //------------------------------------------------------------------------------
143 // 1D transpose tensor contraction x
144 //------------------------------------------------------------------------------
145 template <int NCOMP, int P1d, int Q1d>
146 inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
147   data.slice[data.tidx] = *U;
148   __syncthreads();
149   *V = 0.0;
150   if (data.tidx < P1d)
151     for (CeedInt i = 0; i < Q1d; ++i)
152       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
153   __syncthreads();
154 }
155 
156 //------------------------------------------------------------------------------
157 // 1D interpolate to quadrature points
158 //------------------------------------------------------------------------------
159 template <int NCOMP, int P1d, int Q1d>
160 inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
161   for (CeedInt comp = 0; comp < NCOMP; comp++)
162     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
163 }
164 
165 //------------------------------------------------------------------------------
166 // 1D interpolate transpose
167 //------------------------------------------------------------------------------
168 template <int NCOMP, int P1d, int Q1d>
169 inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
170   for (CeedInt comp=0; comp<NCOMP; comp++)
171     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
172 }
173 
174 //------------------------------------------------------------------------------
175 // 1D derivatives at quadrature points
176 //------------------------------------------------------------------------------
177 template <int NCOMP, int P1d, int Q1d>
178 inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
179   for (CeedInt comp = 0; comp < NCOMP; comp++)
180     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
181 }
182 
183 //------------------------------------------------------------------------------
184 // 1D derivatives transpose
185 //------------------------------------------------------------------------------
186 template <int NCOMP, int P1d, int Q1d>
187 inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
188   for (CeedInt comp = 0; comp < NCOMP; comp++)
189     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
190 }
191 
192 //------------------------------------------------------------------------------
193 // 2D
194 //------------------------------------------------------------------------------
195 
196 //------------------------------------------------------------------------------
197 // L-vector -> E-vector, offsets provided
198 //------------------------------------------------------------------------------
199 template <int NCOMP, int COMPSTRIDE, int P1d>
200 inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
201   if (data.tidx < P1d && data.tidy < P1d) {
202     const CeedInt node = data.tidx + data.tidy*P1d;
203     const CeedInt ind = indices[node + elem * P1d*P1d];
204     for (CeedInt comp = 0; comp < NCOMP; ++comp)
205       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
206   }
207 }
208 
209 //------------------------------------------------------------------------------
210 // L-vector -> E-vector, strided
211 //------------------------------------------------------------------------------
212 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
213 inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
214   if (data.tidx < P1d && data.tidy < P1d) {
215     const CeedInt node = data.tidx + data.tidy*P1d;
216     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
217     for (CeedInt comp = 0; comp < NCOMP; ++comp)
218       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
219   }
220 }
221 
222 //------------------------------------------------------------------------------
223 // E-vector -> L-vector, offsets provided
224 //------------------------------------------------------------------------------
225 template <int NCOMP, int COMPSTRIDE, int P1d>
226 inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
227   if (data.tidx < P1d && data.tidy < P1d) {
228     const CeedInt node = data.tidx + data.tidy*P1d;
229     const CeedInt ind = indices[node + elem * P1d*P1d];
230     for (CeedInt comp = 0; comp < NCOMP; ++comp)
231       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
232   }
233 }
234 
235 //------------------------------------------------------------------------------
236 // E-vector -> L-vector, strided
237 //------------------------------------------------------------------------------
238 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
239 inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
240   if (data.tidx < P1d && data.tidy < P1d) {
241     const CeedInt node = data.tidx + data.tidy*P1d;
242     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
243     for (CeedInt comp = 0; comp < NCOMP; ++comp)
244       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
245   }
246 }
247 
248 //------------------------------------------------------------------------------
249 // 2D tensor contraction x
250 //------------------------------------------------------------------------------
251 template <int NCOMP, int P1d, int Q1d>
252 inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
253   data.slice[data.tidx+data.tidy*T1d] = *U;
254   __syncthreads();
255   *V = 0.0;
256   if (data.tidx < Q1d && data.tidy < P1d)
257     for (CeedInt i = 0; i < P1d; ++i)
258       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
259   __syncthreads();
260 }
261 
262 //------------------------------------------------------------------------------
263 // 2D tensor contract y
264 //------------------------------------------------------------------------------
265 template <int NCOMP, int P1d, int Q1d>
266 inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
267   data.slice[data.tidx+data.tidy*T1d] = *U;
268   __syncthreads();
269   *V = 0.0;
270   if (data.tidx < Q1d && data.tidy < Q1d)
271     for (CeedInt i = 0; i < P1d; ++i)
272       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
273   __syncthreads();
274 }
275 
276 //------------------------------------------------------------------------------
277 // 2D transpose tensor contract y
278 //------------------------------------------------------------------------------
279 template <int NCOMP, int P1d, int Q1d>
280 inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
281   data.slice[data.tidx+data.tidy*T1d] = *U;
282   __syncthreads();
283   *V = 0.0;
284   if (data.tidx < Q1d && data.tidy < P1d)
285     for (CeedInt i = 0; i < Q1d; ++i)
286       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
287   __syncthreads();
288 }
289 
290 //------------------------------------------------------------------------------
291 // 2D transpose tensor contract x
292 //------------------------------------------------------------------------------
293 template <int NCOMP, int P1d, int Q1d>
294 inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
295   data.slice[data.tidx+data.tidy*T1d] = *U;
296   __syncthreads();
297   *V = 0.0;
298   if (data.tidx < P1d && data.tidy < P1d)
299     for (CeedInt i = 0; i < Q1d; ++i)
300       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
301   __syncthreads();
302 }
303 
304 //------------------------------------------------------------------------------
305 // 2D transpose tensor contract and add x
306 //------------------------------------------------------------------------------
307 template <int NCOMP, int P1d, int Q1d>
308 inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
309   data.slice[data.tidx+data.tidy*T1d] = *U;
310   __syncthreads();
311   if (data.tidx < P1d && data.tidy < P1d)
312     for (CeedInt i = 0; i < Q1d; ++i)
313       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
314   __syncthreads();
315 }
316 
317 //------------------------------------------------------------------------------
318 // 2D interpolate to quadrature points
319 //------------------------------------------------------------------------------
320 template <int NCOMP, int P1d, int Q1d>
321 inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
322   CeedScalar r_t[1];
323   for (CeedInt comp = 0; comp < NCOMP; comp++) {
324     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
325     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
326   }
327 }
328 
329 //------------------------------------------------------------------------------
330 // 2D interpolate transpose
331 //------------------------------------------------------------------------------
332 template <int NCOMP, int P1d, int Q1d>
333 inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
334   CeedScalar r_t[1];
335   for (CeedInt comp = 0; comp < NCOMP; comp++) {
336     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
337     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
338   }
339 }
340 
341 //------------------------------------------------------------------------------
342 // 2D derivatives at quadrature points
343 //------------------------------------------------------------------------------
344 template <int NCOMP, int P1d, int Q1d>
345 inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
346   CeedScalar r_t[1];
347   for (CeedInt comp = 0; comp < NCOMP; comp++) {
348     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
349     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
350     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
351     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
352   }
353 }
354 
355 //------------------------------------------------------------------------------
356 // 2D derivatives transpose
357 //------------------------------------------------------------------------------
358 template <int NCOMP, int P1d, int Q1d>
359 inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
360   CeedScalar r_t[1];
361   for (CeedInt comp = 0; comp < NCOMP; comp++) {
362     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
363     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
364     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
365     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
366   }
367 }
368 
369 //------------------------------------------------------------------------------
370 // 3D
371 //------------------------------------------------------------------------------
372 
373 //------------------------------------------------------------------------------
374 // L-vector -> E-vector, offsets provided
375 //------------------------------------------------------------------------------
376 template <int NCOMP, int COMPSTRIDE, int P1d>
377 inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
378   if (data.tidx < P1d && data.tidy < P1d)
379     for (CeedInt z = 0; z < P1d; ++z) {
380       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
381       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
382       for (CeedInt comp = 0; comp < NCOMP; ++comp)
383         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
384     }
385 }
386 
387 //------------------------------------------------------------------------------
388 // L-vector -> E-vector, strided
389 //------------------------------------------------------------------------------
390 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
391 inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
392   if (data.tidx < P1d && data.tidy < P1d)
393     for (CeedInt z = 0; z < P1d; ++z) {
394       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
395       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
396       for (CeedInt comp = 0; comp < NCOMP; ++comp)
397         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
398     }
399 }
400 
401 //------------------------------------------------------------------------------
402 // E-vector -> Q-vector, offests provided
403 //------------------------------------------------------------------------------
404 template <int NCOMP, int COMPSTRIDE, int Q1d>
405 inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
406   if (data.tidx < Q1d && data.tidy < Q1d) {
407     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
408     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
409     for (CeedInt comp = 0; comp < NCOMP; ++comp)
410       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
411   }
412 }
413 
414 //------------------------------------------------------------------------------
415 // E-vector -> Q-vector, strided
416 //------------------------------------------------------------------------------
417 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
418 inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
419   if (data.tidx < Q1d && data.tidy < Q1d) {
420     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
421     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
422     for (CeedInt comp = 0; comp < NCOMP; ++comp)
423       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
424   }
425 }
426 
427 //------------------------------------------------------------------------------
428 // E-vector -> L-vector, offsets provided
429 //------------------------------------------------------------------------------
430 template <int NCOMP, int COMPSTRIDE, int P1d>
431 inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
432   if (data.tidx < P1d && data.tidy < P1d)
433     for (CeedInt z = 0; z < P1d; ++z) {
434       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
435       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
436       for (CeedInt comp = 0; comp < NCOMP; ++comp)
437         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
438     }
439 }
440 
441 //------------------------------------------------------------------------------
442 // E-vector -> L-vector, strided
443 //------------------------------------------------------------------------------
444 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
445 inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
446   if (data.tidx < P1d && data.tidy < P1d)
447     for (CeedInt z = 0; z < P1d; ++z) {
448       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
449       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
450       for (CeedInt comp = 0; comp < NCOMP; ++comp)
451         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
452     }
453 }
454 
455 //------------------------------------------------------------------------------
456 // 3D tensor contract x
457 //------------------------------------------------------------------------------
458 template <int NCOMP, int P1d, int Q1d>
459 inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
460   CeedScalar r_B[P1d];
461   for (CeedInt i = 0; i < P1d; ++i)
462     r_B[i] = B[i + data.tidx*P1d];
463 
464   for (CeedInt k = 0; k < P1d; ++k) {
465     data.slice[data.tidx+data.tidy*T1d] = U[k];
466     __syncthreads();
467     V[k] = 0.0;
468     if (data.tidx < Q1d && data.tidy < P1d)
469       for (CeedInt i = 0; i < P1d; ++i)
470         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
471     __syncthreads();
472   }
473 }
474 
475 //------------------------------------------------------------------------------
476 // 3D tensor contract y
477 //------------------------------------------------------------------------------
478 template <int NCOMP, int P1d, int Q1d>
479 inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
480   CeedScalar r_B[P1d];
481   for (CeedInt i = 0; i < P1d; ++i)
482     r_B[i] = B[i + data.tidy*P1d];
483 
484   for (CeedInt k = 0; k < P1d; ++k) {
485     data.slice[data.tidx+data.tidy*T1d] = U[k];
486     __syncthreads();
487     V[k] = 0.0;
488     if (data.tidx < Q1d && data.tidy < Q1d)
489       for (CeedInt i = 0; i < P1d; ++i)
490         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
491     __syncthreads();
492   }
493 }
494 
495 //------------------------------------------------------------------------------
496 // 3D tensor contract z
497 //------------------------------------------------------------------------------
498 template <int NCOMP, int P1d, int Q1d>
499 inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
500   for (CeedInt k = 0; k < Q1d; ++k) {
501     V[k] = 0.0;
502     if (data.tidx < Q1d && data.tidy < Q1d)
503       for (CeedInt i = 0; i < P1d; ++i)
504         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
505   }
506 }
507 
508 //------------------------------------------------------------------------------
509 // 3D transpose tensor contract z
510 //------------------------------------------------------------------------------
511 template <int NCOMP, int P1d, int Q1d>
512 inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
513   for (CeedInt k = 0; k < P1d; ++k) {
514     V[k] = 0.0;
515     if (data.tidx < Q1d && data.tidy < Q1d)
516       for (CeedInt i = 0; i < Q1d; ++i)
517         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
518   }
519 }
520 
521 //------------------------------------------------------------------------------
522 // 3D transpose tensor contract y
523 //------------------------------------------------------------------------------
524 template <int NCOMP, int P1d, int Q1d>
525 inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
526   CeedScalar r_B[Q1d];
527   for (CeedInt i = 0; i < Q1d; ++i)
528     r_B[i] = B[data.tidy + i*P1d];
529 
530   for (CeedInt k = 0; k < P1d; ++k) {
531     data.slice[data.tidx+data.tidy*T1d] = U[k];
532     __syncthreads();
533     V[k] = 0.0;
534     if (data.tidx < Q1d && data.tidy < P1d)
535       for (CeedInt i = 0; i < Q1d; ++i)
536         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
537     __syncthreads();
538   }
539 }
540 
541 //------------------------------------------------------------------------------
542 // 3D transpose tensor contract add y
543 //------------------------------------------------------------------------------
544 template <int NCOMP, int P1d, int Q1d>
545 inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
546   CeedScalar r_B[Q1d];
547   for (CeedInt i = 0; i < Q1d; ++i)
548     r_B[i] = B[data.tidy + i*P1d];
549 
550   for (CeedInt k = 0; k < P1d; ++k) {
551     data.slice[data.tidx+data.tidy*T1d] = U[k];
552     __syncthreads();
553     if (data.tidx < Q1d && data.tidy < P1d)
554       for (CeedInt i = 0; i < Q1d; ++i)
555         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
556     __syncthreads();
557   }
558 }
559 
560 //------------------------------------------------------------------------------
561 // 3D transpose tensor contract x
562 //------------------------------------------------------------------------------
563 template <int NCOMP, int P1d, int Q1d>
564 inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
565   CeedScalar r_B[Q1d];
566   for (CeedInt i = 0; i < Q1d; ++i)
567     r_B[i] = B[data.tidx + i*P1d];
568 
569   for (CeedInt k = 0; k < P1d; ++k) {
570     data.slice[data.tidx+data.tidy*T1d] = U[k];
571     __syncthreads();
572     V[k] = 0.0;
573     if (data.tidx < P1d && data.tidy < P1d)
574       for (CeedInt i = 0; i < Q1d; ++i)
575         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
576     __syncthreads();
577   }
578 }
579 
580 //------------------------------------------------------------------------------
581 // 3D transpose tensor contract add x
582 //------------------------------------------------------------------------------
583 template <int NCOMP, int P1d, int Q1d>
584 inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
585   CeedScalar r_B[Q1d];
586   for (CeedInt i = 0; i < Q1d; ++i)
587     r_B[i] = B[data.tidx + i*P1d];
588 
589   for (CeedInt k = 0; k < P1d; ++k) {
590     data.slice[data.tidx+data.tidy*T1d] = U[k];
591     __syncthreads();
592     if (data.tidx < P1d && data.tidy < P1d)
593       for (CeedInt i = 0; i < Q1d; ++i)
594         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
595     __syncthreads();
596   }
597 }
598 
599 //------------------------------------------------------------------------------
600 // 3D interpolate to quadrature points
601 //------------------------------------------------------------------------------
602 template <int NCOMP, int P1d, int Q1d>
603 inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
604   CeedScalar r_t1[T1d];
605   CeedScalar r_t2[T1d];
606   for (CeedInt comp = 0; comp < NCOMP; comp++) {
607     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
608     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
609     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
610   }
611 }
612 
613 //------------------------------------------------------------------------------
614 // 3D interpolate transpose
615 //------------------------------------------------------------------------------
616 template <int NCOMP, int P1d, int Q1d>
617 inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
618   CeedScalar r_t1[T1d];
619   CeedScalar r_t2[T1d];
620   for (CeedInt comp = 0; comp < NCOMP; comp++) {
621     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
622     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
623     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
624   }
625 }
626 
627 //------------------------------------------------------------------------------
628 // 3D derivatives at quadrature points
629 //------------------------------------------------------------------------------
630 template <int NCOMP, int P1d, int Q1d>
631 inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
632   CeedScalar r_t1[T1d];
633   CeedScalar r_t2[T1d];
634   for (CeedInt comp = 0; comp < NCOMP; comp++) {
635     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
636     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
637     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
638     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
639     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
640     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
641     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
642     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
643     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
644   }
645 }
646 
647 //------------------------------------------------------------------------------
648 // 3D derivatives transpose
649 //------------------------------------------------------------------------------
650 template <int NCOMP, int P1d, int Q1d>
651 inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
652   CeedScalar r_t1[T1d];
653   CeedScalar r_t2[T1d];
654   for (CeedInt comp = 0; comp < NCOMP; comp++) {
655     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
656     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
657     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
658     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
659     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
660     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
661     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
662     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
663     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
664   }
665 }
666 
667 //------------------------------------------------------------------------------
668 // 3D collocated derivatives computation
669 //------------------------------------------------------------------------------
670 template <int NCOMP, int Q1d>
671 inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
672   if (data.tidx < Q1d && data.tidy < Q1d) {
673     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
674       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
675       __syncthreads();
676       // X derivative
677       r_V[comp+0*NCOMP] = 0.0;
678       for (CeedInt i = 0; i < Q1d; ++i)
679         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
680       // Y derivative
681       r_V[comp+1*NCOMP] = 0.0;
682       for (CeedInt i = 0; i < Q1d; ++i)
683         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
684       // Z derivative
685       r_V[comp+2*NCOMP] = 0.0;
686       for (CeedInt i = 0; i < Q1d; ++i)
687         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
688       __syncthreads();
689     }
690   }
691 }
692 
693 //------------------------------------------------------------------------------
694 // 3D collocated derivatives transpose
695 //------------------------------------------------------------------------------
696 template <int NCOMP, int Q1d>
697 inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
698   if (data.tidx < Q1d && data.tidy < Q1d) {
699     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
700       // X derivative
701       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
702       __syncthreads();
703       for (CeedInt i = 0; i < Q1d; ++i)
704         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
705       __syncthreads();
706       // Y derivative
707       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
708       __syncthreads();
709       for (CeedInt i = 0; i < Q1d; ++i)
710         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
711       __syncthreads();
712       // Z derivative
713       for (CeedInt i = 0; i < Q1d; ++i)
714         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
715     }
716   }
717 }
718 
719 //------------------------------------------------------------------------------
720 // 1D quadrature weights
721 //------------------------------------------------------------------------------
722 template <int Q1d>
723 inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
724   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
725 }
726 
727 //------------------------------------------------------------------------------
728 // 2D quadrature weights
729 //------------------------------------------------------------------------------
730 template <int Q1d>
731 inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
732   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
733         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
734 }
735 
736 //------------------------------------------------------------------------------
737 // 3D quadrature weights
738 //------------------------------------------------------------------------------
739 template <int Q1d>
740 inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
741   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
742   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
743   for (CeedInt z = 0; z < Q1d; ++z)
744     w[z] = quad ? pw*qweight1d[z] : 0.0;
745 }
746 
747 );
748 //------------------------------------------------------------------------------
// Build single operator kernel
750 //------------------------------------------------------------------------------
751 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
752 
753   using std::ostringstream;
754   using std::string;
755   int ierr;
756   bool setupdone;
757   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
758   if (setupdone) return CEED_ERROR_SUCCESS;
759   Ceed ceed;
760   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
761   CeedOperator_Cuda_gen *data;
762   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
763   CeedQFunction qf;
764   CeedQFunction_Cuda_gen *qf_data;
765   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
766   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
767   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
768           numoutputfields, ncomp, dim = 0, lsize;
769   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
770   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
771   CeedOperatorField *opinputfields, *opoutputfields;
772   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
773   CeedChkBackend(ierr);
774   CeedQFunctionField *qfinputfields, *qfoutputfields;
775   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
776   CeedChkBackend(ierr);
777   CeedEvalMode emode;
778   CeedBasis basis;
779   CeedBasis_Cuda_shared *basis_data;
780   CeedElemRestriction Erestrict;
781   CeedElemRestriction_Cuda *restr_data;
782 
783   // Check for restriction only identity operator
784   bool is_identity_qf;
785   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
786   if (is_identity_qf) {
787     CeedEvalMode emodein, emodeout;
788     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
789     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
790     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
791       // LCOV_EXCL_START
792       return CeedError(ceed, CEED_ERROR_BACKEND,
793                        "Backend does not implement restriction only identity operators");
794     // LCOV_EXCL_STOP
795   }
796 
797   ostringstream code;
798   string devFunctions(deviceFunctions);
799 
800   // Add atomicAdd function for old NVidia architectures
801   struct cudaDeviceProp prop;
802   Ceed_Cuda *ceed_data;
803   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
804   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr);
805   if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
806     code << atomicAdd;
807   }
808 
809   code << devFunctions;
810 
811   string qFunction(qf_data->qFunctionSource);
812   string qFunctionName(qf_data->qFunctionName);
813   string oper;
814   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
815 
816   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
817   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
818   code << "#define CeedPragmaSIMD\n";
819   code << "#define CEED_ERROR_SUCCESS 0\n\n";
820 
821   // Find dim and Q1d
822   bool useCollograd = true;
823   data->maxP1d = 0;
824   for (CeedInt i = 0; i < numinputfields; i++) {
825     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
826     if (basis != CEED_BASIS_COLLOCATED) {
827       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
828       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
829       CeedChkBackend(ierr);
830 
831       // Check for collocated gradient
832       useCollograd = useCollograd && basis_data->d_collograd1d;
833 
834       // Collect dim and Q1d
835       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
836       bool isTensor;
837       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
838       if (isTensor) {
839         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
840         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
841         if (P1d>data->maxP1d) data->maxP1d = P1d;
842       } else {
843         // LCOV_EXCL_START
844         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
845         // LCOV_EXCL_STOP
846         }
847     }
848   }
849   // Check output bases for Q1d, dim as well
850   //   The only imput basis might be CEED_BASIS_COLLOCATED
851   for (CeedInt i = 0; i < numoutputfields; i++) {
852     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
853 
854     if (basis != CEED_BASIS_COLLOCATED) {
855       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
856       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
857       CeedChkBackend(ierr);
858 
859       // Collect dim and Q1d
860       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
861       bool isTensor;
862       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
863       if (isTensor) {
864         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
865       } else {
866         // LCOV_EXCL_START
867         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
868         // LCOV_EXCL_STOP
869         }
870 
871       // Check for collocated gradient
872       useCollograd = useCollograd && basis_data->d_collograd1d;
873     }
874   }
875   data->dim = dim;
876   data->Q1d = Q1d;
877 
878   // Define CEED_Q_VLA
879   if (dim != 3 || useCollograd) {
880     code << "\n#define CEED_Q_VLA 1\n\n";
881   } else {
882     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
883   }
884 
885   code << qFunction;
886 
887   // Setup
888   code << "\n// -----------------------------------------------------------------------------\n";
889   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
890   for (CeedInt i = 0; i < numinputfields; i++) {
891     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
892     CeedChkBackend(ierr);
893     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
894       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
895     }
896   }
897 
898   for (CeedInt i = 0; i < numoutputfields; i++) {
899     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
900   }
901 
902   code << "  const CeedInt Dim = "<<dim<<";\n";
903   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
904 
905   code << "  extern __shared__ CeedScalar slice[];\n";
906   code << "  BackendData data;\n";
907   code << "  data.tidx = threadIdx.x;\n";
908   code << "  data.tidy = threadIdx.y;\n";
909   code << "  data.tidz = threadIdx.z;\n";
910   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
911   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
912 
913   code << "\n  // -- Input field constants and basis data --\n";
914   //Initialize constants, and matrices B and G
915   for (CeedInt i = 0; i < numinputfields; i++) {
916     code << "  // ---- Input field "<<i<<" ----\n";
917     // Get elemsize, emode, ncomp
918     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
919     CeedChkBackend(ierr);
920     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
921     CeedChkBackend(ierr);
922     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
923     CeedChkBackend(ierr);
924     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
925     CeedChkBackend(ierr);
926 
927     // Set field constants
928     if (emode != CEED_EVAL_WEIGHT) {
929       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
930       if (basis != CEED_BASIS_COLLOCATED) {
931         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
932         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
933       } else {
934         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
935       }
936       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
937     }
938 
939     // Load basis data
940     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
941     switch (emode) {
942     case CEED_EVAL_NONE:
943       break;
944     case CEED_EVAL_INTERP:
945       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
946       data->B.in[i] = basis_data->d_interp1d;
947       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
948       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
949       break;
950     case CEED_EVAL_GRAD:
951       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
952       data->B.in[i] = basis_data->d_interp1d;
953       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
954       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
955       if (useCollograd) {
956         data->G.in[i] = basis_data->d_collograd1d;
957         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
958         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
959       } else {
960         data->G.in[i] = basis_data->d_grad1d;
961         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
962         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
963       }
964       break;
965     case CEED_EVAL_WEIGHT:
966       break; // No action
967     case CEED_EVAL_DIV:
968       break; // TODO: Not implemented
969     case CEED_EVAL_CURL:
970       break; // TODO: Not implemented
971     }
972   }
973 
974   code << "\n  // -- Output field constants and basis data --\n";
975   for (CeedInt i = 0; i < numoutputfields; i++) {
976     code << "  // ---- Output field "<<i<<" ----\n";
977     // Get elemsize, emode, ncomp
978     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
979     CeedChkBackend(ierr);
980     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
981     CeedChkBackend(ierr);
982     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
983     CeedChkBackend(ierr);
984     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
985     CeedChkBackend(ierr);
986 
987     // Set field constants
988     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
989     if (basis != CEED_BASIS_COLLOCATED) {
990       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
991       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
992     } else {
993       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
994     }
995     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
996 
997     // Load basis data
998     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
999     switch (emode) {
1000     case CEED_EVAL_NONE:
1001       break; // No action
1002     case CEED_EVAL_INTERP:
1003       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1004       data->B.out[i] = basis_data->d_interp1d;
1005       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1006       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1007       break;
1008     case CEED_EVAL_GRAD:
1009       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1010       data->B.out[i] = basis_data->d_interp1d;
1011       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1012       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1013       if (useCollograd) {
1014         data->G.out[i] = basis_data->d_collograd1d;
1015         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1016         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1017       } else {
1018         data->G.out[i] = basis_data->d_grad1d;
1019         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1020         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1021       }
1022       break;
1023     // LCOV_EXCL_START
1024     case CEED_EVAL_WEIGHT: {
1025       Ceed ceed;
1026       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1027       return CeedError(ceed, CEED_ERROR_BACKEND,
1028                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1029       break; // Should not occur
1030     }
1031     case CEED_EVAL_DIV:
1032       break; // TODO: Not implemented
1033     case CEED_EVAL_CURL:
1034       break; // TODO: Not implemented
1035       // LCOV_EXCL_STOP
1036     }
1037   }
1038   code << "\n  // -- Element loop --\n";
1039   code << "  __syncthreads();\n";
1040   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1041   // Input basis apply if needed
1042   // Generate the correct eval mode code for each input
1043   code << "    // -- Input field restrictions and basis actions --\n";
1044   for (CeedInt i = 0; i < numinputfields; i++) {
1045     code << "    // ---- Input field "<<i<<" ----\n";
1046     // Get elemsize, emode, ncomp
1047     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1048     CeedChkBackend(ierr);
1049     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1050     CeedChkBackend(ierr);
1051     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1052     CeedChkBackend(ierr);
1053     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1054     CeedChkBackend(ierr);
1055 
1056     // Restriction
1057     if (emode != CEED_EVAL_WEIGHT &&
1058         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1059       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1060 
1061       bool isStrided;
1062       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1063       if (!isStrided) {
1064         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1065         CeedChkBackend(ierr);
1066         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1067         CeedInt compstride;
1068         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1069         code << "    // CompStride: "<<compstride<<"\n";
1070         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1071         data->indices.in[i] = restr_data->d_ind;
1072         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1073       } else {
1074         bool backendstrides;
1075         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1076         CeedChkBackend(ierr);
1077         CeedInt nelem;
1078         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1079         CeedChkBackend(ierr);
1080         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1081         if (!backendstrides) {
1082           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1083           CeedChkBackend(ierr);
1084         }
1085         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1086         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1087       }
1088     }
1089 
1090     // Basis action
1091     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1092     switch (emode) {
1093     case CEED_EVAL_NONE:
1094       if (!useCollograd) {
1095         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1096       }
1097       break;
1098     case CEED_EVAL_INTERP:
1099       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101       break;
1102     case CEED_EVAL_GRAD:
1103       if (useCollograd) {
1104         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1105         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1106       } else {
1107         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1108         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1109       }
1110       break;
1111     case CEED_EVAL_WEIGHT:
1112       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1113       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1114       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1115       data->W = basis_data->d_qweight1d;
1116       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1117       break; // No action
1118     case CEED_EVAL_DIV:
1119       break; // TODO: Not implemented
1120     case CEED_EVAL_CURL:
1121       break; // TODO: Not implemented
1122     }
1123   }
1124 
1125   // Q function
1126   code << "\n    // -- Output field setup --\n";
1127   for (CeedInt i = 0; i < numoutputfields; i++) {
1128       code << "\n    // ---- Output field "<<i<<" ----\n";
1129     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1130     CeedChkBackend(ierr);
1131     if (emode==CEED_EVAL_GRAD)
1132     {
1133       if (useCollograd) {
1134         //Accumulator for gradient slices
1135         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1136         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1137         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1138         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1139         code << "      }\n";
1140         code << "    }\n";
1141       } else {
1142         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1143       }
1144     }
1145     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1146     {
1147       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1148     }
1149   }
1150   // We treat quadrature points per slice in 3d to save registers
1151   if (useCollograd) {
1152     code << "\n    // Note: Collocated Gradient\n";
1153     code << "#pragma unroll\n";
1154     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1155     code << "      // -- Input fields --\n";
1156     for (CeedInt i = 0; i < numinputfields; i++) {
1157       code << "      // ---- Input field "<<i<<" ----\n";
1158       // Get elemsize, emode, ncomp
1159       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1160       CeedChkBackend(ierr);
1161       // Basis action
1162       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1163       switch (emode) {
1164       case CEED_EVAL_NONE:
1165         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1166 
1167         bool isStrided;
1168         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1169         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1170         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1171         if (!isStrided) {
1172           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1173           CeedChkBackend(ierr);
1174           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1175           CeedInt compstride;
1176           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1177           code << "      // CompStride: "<<compstride<<"\n";
1178           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1179           data->indices.in[i] = restr_data->d_ind;
1180           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1181         } else {
1182           bool backendstrides;
1183           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1184           CeedChkBackend(ierr);
1185           CeedInt nelem;
1186           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1187           CeedChkBackend(ierr);
1188           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1189           if (!backendstrides) {
1190             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1191             CeedChkBackend(ierr);
1192           }
1193           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1194           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1195         }
1196         break;
1197       case CEED_EVAL_INTERP:
1198         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1199         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1200         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1201         code << "      }\n";
1202         break;
1203       case CEED_EVAL_GRAD:
1204         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1205         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1206         break;
1207       case CEED_EVAL_WEIGHT:
1208         code << "      CeedScalar r_q"<<i<<"[1];\n";
1209         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1210         break; // No action
1211       case CEED_EVAL_DIV:
1212         break; // TODO: Not implemented
1213       case CEED_EVAL_CURL:
1214         break; // TODO: Not implemented
1215       }
1216     }
1217     code << "\n      // -- Output fields --\n";
1218     for (CeedInt i = 0; i < numoutputfields; i++) {
1219       code << "      // ---- Output field "<<i<<" ----\n";
1220       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1221       CeedChkBackend(ierr);
1222       // Basis action
1223       switch (emode) {
1224       case CEED_EVAL_NONE:
1225         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1226         break; // No action
1227       case CEED_EVAL_INTERP:
1228         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1229         break;
1230       case CEED_EVAL_GRAD:
1231         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1232         break;
1233       case CEED_EVAL_WEIGHT:
1234         break; // Should not occur
1235       case CEED_EVAL_DIV:
1236         break; // TODO: Not implemented
1237       case CEED_EVAL_CURL:
1238         break; // TODO: Not implemented
1239       }
1240     }
1241   } else {
1242     code << "\n      // Note: No Collocated Gradient\n";
    // Alias QFunction I/O registers to the basis-action temporaries:
    // r_q<i> (inputs) aliases r_t<i>, r_qq<i> (outputs) aliases r_tt<i>.
    // NOTE(review): this code sits inside a conditional whose opening brace is
    // above this excerpt — presumably the non-collocated-gradient branch;
    // confirm upstream.
    code << "      // -- Input fields --\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << "      // ---- Input field "<<i<<" ----\n";
      code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    code << "      // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "      // ---- Output field "<<i<<" ----\n";
      code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  // Emit the in[]/out[] pointer arrays handed to the user QFunction; each
  // entry aliases the corresponding per-field register block r_q<i>/r_qq<i>.
  code << "\n      // -- QFunction Inputs and outputs --\n";
  code << "      CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "      // ---- Input field "<<i<<" ----\n";
    code << "      in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << "      CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "      // ---- Output field "<<i<<" ----\n";
    code << "      out["<<i<<"] = r_qq"<<i<<";\n";
  }
  // Emit the QFunction call. The second argument is the number of quadrature
  // points processed per call: 1 for 1D/2D and for the 3D collocated-gradient
  // path, Q1d for the 3D non-collocated path.
  code << "\n      // -- Apply QFunction --\n";
  code << "      "<<qFunctionName<<"(ctx, ";
  if (dim != 3 || useCollograd) {
    code << "1";
  } else {
    code << "Q1d";
  }
  code << ", in, out);\n";
  if (useCollograd) {
    // Collocated gradient: scatter the single-point QFunction outputs (r_qq<i>)
    // back into the per-field temporaries (r_tt<i>) at quadrature index q.
    code << "\n      // Note: Collocated Gradient\n";
    code << "      // -- Output fields --\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "      // ---- Output field "<<i<<" ----\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChkBackend(ierr);
      // Basis action
      code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        // NONE and INTERP generate identical code here: copy each component's
        // value into slot q of the Q1d-strided register array.
        code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << "      }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << "      }\n";
        break;
      case CEED_EVAL_GRAD:
        // Transposed collocated gradient at point q, using the shared
        // collocated gradient matrix s_G_out_<i>.
        code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    // Closes a generated loop (over q, per the indices above) whose opening
    // was emitted before this excerpt — confirm against the emitting code.
    code << "    }\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n    // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "    // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);
    // Basis action
    code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      // No basis action: the restricted values are the quadrature values.
      code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      // Transposed interpolation back to the P_out_<i> basis points.
      code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Collocated gradient: the gradient transpose was already applied at
        // the quadrature points above, so only interpTranspose remains here.
        code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        // Full transposed gradient (interpolation + differentiation matrices).
        code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      // WEIGHT is input-only; abort code generation with a backend error.
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
    // Restriction
      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
    if (!isStrided) {
      // Indexed (offset) restriction: bake the L-vector size and component
      // stride into the generated source, and stash the device index array so
      // the kernel launch can pass it as indices.out[i].
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
      code << "    // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      // Strided restriction: strides become compile-time template arguments.
      // {1, elemsize*nelem, elemsize} is the backend default ordering; query
      // the user-provided strides only when the backend defaults don't apply.
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChkBackend(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  // Close the generated element loop and the kernel body.
  code << "  }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug(code.str().c_str());

  // JIT-compile the assembled CUDA source. "T1d" is a compile-time constant
  // sized to the larger of Q1d and the maximum 1D basis size (data->maxP1d),
  // so the same thread layout covers both basis and quadrature extents.
  ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
                         "T1d", CeedIntMax(Q1d, data->maxP1d));
  CeedChkBackend(ierr);
  // Look up the generated kernel (named by `oper`) in the compiled module.
  ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
  CeedChkBackend(ierr);

  // Mark the operator as built so this codegen path is not re-run.
  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
1399 //------------------------------------------------------------------------------
1400