xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision f4f98f9d61969ce0e2a6a2f6adc6df63e4d4c354)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <cuda_runtime.h>
22 #include <iostream>
23 #include <sstream>
24 #include "ceed-cuda-gen.h"
25 #include "../cuda/ceed-cuda-compile.h"
26 #include "../cuda-ref/ceed-cuda-ref.h"
27 #include "../cuda-shared/ceed-cuda-shared.h"
28 
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
// Software double-precision atomic add via a 64-bit compare-and-swap loop,
// for devices without native atomicAdd(double *) (native support requires
// SM60+). Returns the value that was stored at *address before this add.
// NOTE(review): reinterprets CeedScalar through __double_as_longlong, so this
// is only valid when CeedScalar is double precision.
__device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Retry until no other thread updated *address between our read and CAS
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                    __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
48 
49 static const char *deviceFunctions = QUOTE(
50 
//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Fixed-capacity (16 fields) arrays of device pointers: operator input/output
// field data (CudaFields) and restriction offset indices (CudaFieldsInt).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread cached indices plus a scratch 'slice' pointer used by the tensor
// contraction helpers below (indexed with row stride T1d and bracketed by
// __syncthreads, so presumably block-shared memory -- confirm at kernel setup).
typedef struct {
  CeedInt tidx;       // thread index in x
  CeedInt tidy;       // thread index in y
  CeedInt tidz;       // thread index in z
  CeedInt tid;        // linear thread id within the block (see loadMatrix)
  CeedScalar* slice;  // scratch slice for cross-thread contractions
} BackendData;
64 
//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy a P x Q basis matrix d_B into B, all block threads
// striding over the P*Q entries starting from their linear id.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt stride = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt idx = data.tid; idx < P*Q; idx += stride)
    B[idx] = d_B[idx];
}
73 
//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather NCOMP values for this thread's node from d_u using the restriction
// indices; components are spaced COMPSTRIDE apart in the L-vector.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d) return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + COMPSTRIDE * c];
}
90 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather NCOMP values for this thread's node using compile-time strides for
// node, component, and element (no index array needed).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d) return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
103 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add NCOMP values into d_v at the indexed node; atomicAdd because
// multiple elements may share the same L-vector entry.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d) return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
}
116 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate NCOMP values into d_v using compile-time strides; plain += since
// a strided restriction maps each E-vector entry to a unique L-vector entry.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d) return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
129 
//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V[tidx] = sum_i B[i + tidx*P1d] * U[i], staging each thread's U through
// data.slice. All threads write slice and zero V; only threads with
// tidx < Q1d accumulate. Both barriers sit outside the divergent branch, so
// every thread in the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
143 
//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Transpose action: V[tidx] = sum_i B[tidx + i*P1d] * U[i] for tidx < P1d,
// i.e. B applied with its index roles swapped relative to ContractX1d.
// Same staging/barrier pattern: all threads write slice and zero V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
157 
//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B along x for each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
166 
//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Apply the transpose of the interpolation matrix c_B along x per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
175 
//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Apply the gradient matrix c_G along x per component. c_B is unused here:
// in 1D the single contraction is the derivative itself (kept for a uniform
// signature with the 2D/3D variants).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
184 
//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Apply the transpose of the gradient matrix c_G along x per component
// (c_B unused in 1D; signature kept uniform with 2D/3D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
193 
//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather NCOMP values for this thread's (tidx, tidy) node via restriction
// indices; components spaced COMPSTRIDE apart in the L-vector.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt dof = indices[data.tidx + P1d * data.tidy + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + COMPSTRIDE * c];
}
210 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather NCOMP values for this thread's (tidx, tidy) node using compile-time
// node/component/element strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt node = data.tidx + P1d * data.tidy;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
223 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add NCOMP values into d_v at the indexed node; atomicAdd because
// neighboring elements may share L-vector entries.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt dof = indices[data.tidx + P1d * data.tidy + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
}
236 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate NCOMP values into d_v with compile-time strides; plain += since
// strided restrictions map E-vector entries to unique L-vector entries.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt node = data.tidx + P1d * data.tidy;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
249 
//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// V[tidx,tidy] = sum_i B[i + tidx*P1d] * U[i,tidy], staged through data.slice
// with row stride T1d. Every thread writes slice and zeroes V; only threads
// with tidx < Q1d && tidy < P1d accumulate. Barriers are outside the divergent
// branch so all block threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
263 
//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// V[tidx,tidy] = sum_i B[i + tidy*P1d] * U[tidx,i], staged through data.slice
// (row stride T1d). Only threads with tidx < Q1d && tidy < Q1d accumulate;
// all threads write slice, zero V, and reach both barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
277 
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose of ContractY2d: V[tidx,tidy] = sum_i B[tidy + i*P1d] * U[tidx,i]
// for tidx < Q1d && tidy < P1d (output back on P1d nodes in y). Same
// slice-staging and barrier pattern as the forward contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
291 
//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose of ContractX2d: V[tidx,tidy] = sum_i B[tidx + i*P1d] * U[i,tidy]
// for tidx < P1d && tidy < P1d (output on P1d x P1d nodes). Overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
305 
//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but ACCUMULATES into V instead of overwriting
// (no 'V = 0.0'); the caller must have initialized V. Used to sum the x- and
// y-derivative contributions in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
318 
//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract c_B along x, then along y, per
// component, through a one-element register buffer.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
330 
//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract c_B^T along y, then along x, per
// component (reverse order of interp2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
342 
//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Tensor-product gradient. Output layout: r_V[comp + dim*NCOMP] with dim 0
// the x-derivative (G in x, B in y) and dim 1 the y-derivative (B in x,
// G in y).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    // d/dx: differentiate in x, interpolate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c + 0*NCOMP]);
    // d/dy: interpolate in x, differentiate in y
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c + 1*NCOMP]);
  }
}
356 
//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: input r_U[comp + dim*NCOMP] holds x- (dim 0) and y-
// (dim 1) derivative data; both contributions are summed into r_V[comp]
// (second pass uses the accumulating x-contraction).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    // x-derivative part: B^T in y, then G^T in x (overwrites r_V)
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + 0*NCOMP], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c]);
    // y-derivative part: G^T in y, then B^T in x (adds into r_V)
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + 1*NCOMP], c_G, tmp);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
370 
//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather a z-column of nodes for this thread's (tidx, tidy): r_u holds P1d
// values per component, laid out r_u[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + P1d * (data.tidy + P1d * z) + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[dof + COMPSTRIDE * c];
  }
}
388 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather a z-column of nodes for (tidx, tidy) with compile-time strides;
// r_u layout is r_u[z + comp*P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt node = data.tidx + P1d * (data.tidy + P1d * z);
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[base + c * STRIDES_COMP];
  }
}
402 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one x-y slice (level q) of quadrature-point data: each in-range thread
// gathers NCOMP values for its (tidx, tidy, q) point through the index array.
// Fixed: stray empty statement (';;') after the index load.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
415 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one x-y slice (level q) of quadrature-point data with compile-time
// node/component/element strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.tidx >= Q1d || data.tidy >= Q1d) return;
  const CeedInt node = data.tidx + Q1d * (data.tidy + Q1d * q);
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
428 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add a z-column of nodes (r_v layout r_v[z + comp*P1d]) into d_v;
// atomicAdd because elements may share L-vector entries.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + P1d * (data.tidy + P1d * z) + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[z + c*P1d]);
  }
}
442 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate a z-column of nodes (r_v layout r_v[z + comp*P1d]) into d_v with
// compile-time strides; plain += since strided entries are unique.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt node = data.tidx + P1d * (data.tidy + P1d * z);
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c * STRIDES_COMP] += r_v[z + c*P1d];
  }
}
456 
//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// For each z-level k, V[k][tidx,tidy] = sum_i B[i + tidx*P1d] * U[k][i,tidy].
// Each thread first caches its row of B in registers, then stages U level by
// level through data.slice (row stride T1d). All threads write slice and zero
// V[k]; only tidx < Q1d && tidy < P1d accumulate. Barriers are outside the
// divergent branch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
476 
//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// For each z-level k, V[k][tidx,tidy] = sum_i B[i + tidy*P1d] * U[k][tidx,i].
// B row cached in registers per thread; U staged per level through data.slice.
// Only tidx < Q1d && tidy < Q1d accumulate; all threads hit both barriers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
496 
//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// V[k] = sum_i B[i + k*P1d] * U[i] for k < Q1d. The z-column lives in this
// thread's registers, so no shared-memory staging or barriers are needed.
// Out-of-range threads still zero V[k] but skip the accumulation.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
509 
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose of ContractZ3d: V[k] = sum_i B[k + i*P1d] * U[i] for k < P1d.
// Register-only like the forward z-contraction; no barriers required.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
522 
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose of ContractY3d: per level k, V[k][tidx,tidy] =
// sum_i B[tidy + i*P1d] * U[k][tidx,i] for tidx < Q1d && tidy < P1d.
// B column cached in registers; U staged per level through data.slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
542 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but ACCUMULATES into V (no 'V[k] = 0.0');
// the caller must have initialized V before use.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
561 
//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose of ContractX3d: per level k, V[k][tidx,tidy] =
// sum_i B[tidx + i*P1d] * U[k][i,tidy] for tidx < P1d && tidy < P1d.
// Overwrites V; B column cached in registers, U staged through data.slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
581 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but ACCUMULATES into V (no 'V[k] = 0.0');
// used to sum derivative contributions in the transpose gradient.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
600 
//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract c_B along x, y, then z per
// component. r_U is laid out [comp*P1d], r_V as [comp*Q1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar buf1[T1d];
  CeedScalar buf2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, buf1);
    ContractY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*Q1d]);
  }
}
614 
//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract c_B^T along z, y, then x per component
// (reverse order of interp3d). r_U layout [comp*Q1d], r_V layout [comp*P1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar buf1[T1d];
  CeedScalar buf2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, buf1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*P1d]);
  }
}
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives at quadrature points
631 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Full (non-collocated) gradient: for each component, each derivative
  // direction applies c_G in that direction and c_B in the other two.
  // Output layout: r_V[comp*Q1d + dim*NCOMP*Q1d], dim = 0 (x), 1 (y), 2 (z).
  CeedScalar scratch1[T1d];
  CeedScalar scratch2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    const CeedScalar *u = r_U + comp*P1d;
    // d/dx: differentiate in x, interpolate in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, u, c_G, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy: differentiate in y, interpolate in x and z
    ContractX3d<NCOMP, P1d, Q1d>(data, u, c_B, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_G, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz: differentiate in z, interpolate in x and y
    ContractX3d<NCOMP, P1d, Q1d>(data, u, c_B, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
648 
649 //------------------------------------------------------------------------------
650 // 3D derivatives transpose
651 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad3d: accumulate the three derivative slots of r_U
  // (layout r_U[comp*Q1d + dim*NCOMP*Q1d]) back into the nodal vector.
  // The first contraction chain writes r_V; the next two add into it.
  CeedScalar scratch1[T1d];
  CeedScalar scratch2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    CeedScalar *v = r_V + comp*P1d;
    // x-derivative slot: transposed derivative applied in x
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, scratch1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, scratch2, c_G, v);
    // y-derivative slot: transposed derivative applied in y
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, scratch1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, scratch1, c_G, scratch2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, v);
    // z-derivative slot: transposed derivative applied in z
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, scratch1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, v);
  }
}
668 
669 //------------------------------------------------------------------------------
670 // 3D collocated derivatives computation
671 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient at z-slice q: x/y derivatives contract through the
  // shared-memory slice, the z derivative reads r_U registers directly.
  // Output layout: r_V[comp + dim*NCOMP], dim = 0 (x), 1 (y), 2 (z).
  //
  // Fix: the barriers must be reached by ALL threads of the block. The
  // previous version placed __syncthreads() inside the divergent
  // `tidx < Q1d && tidy < Q1d` branch, which is undefined behavior whenever
  // the thread block is wider than Q1d (T1d > Q1d). Keep the bounds check on
  // the memory accesses only and hoist the barriers to uniform control flow.
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register-only, no shared memory involved)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites the slice
    __syncthreads();
  }
}
694 
695 //------------------------------------------------------------------------------
696 // 3D collocated derivatives transpose
697 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of gradCollo3d: accumulate the three derivative slots of r_U
  // (layout r_U[comp + dim*NCOMP]) into the quadrature-point values r_V for
  // z-slice q. The z contribution is only partial here; the q-loop in the
  // caller completes it.
  //
  // Fix: __syncthreads() was previously executed inside the divergent
  // `tidx < Q1d && tidy < Q1d` branch — undefined behavior when the block is
  // wider than Q1d (T1d > Q1d). Barriers are now in uniform control flow and
  // only the shared/global memory accesses stay guarded.
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative (register-only, no barrier needed)
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
720 
721 //------------------------------------------------------------------------------
722 // 1D quadrature weights
723 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Each in-bounds thread takes its 1D quadrature weight; others get zero.
  if (data.tidx < Q1d)
    *w = qweight1d[data.tidx];
  else
    *w = 0.0;
}
728 
729 //------------------------------------------------------------------------------
730 // 2D quadrature weights
731 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Tensor-product 2D weight: product of the 1D weights in x and y for
  // in-bounds threads, zero for the rest of the tile.
  const bool in_bounds = data.tidx < Q1d && data.tidy < Q1d;
  *w = in_bounds ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}
737 
738 //------------------------------------------------------------------------------
739 // 3D quadrature weights
740 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
  // Tensor-product 3D weights: each thread owns a full column of Q1d
  // z-points, so precompute the xy product once and scale per z level.
  const bool in_bounds = data.tidx < Q1d && data.tidy < Q1d;
  CeedScalar wxy = 0.0;
  if (in_bounds)
    wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = in_bounds ? wxy*qweight1d[z] : 0.0;
}
748 
749 );
750 //------------------------------------------------------------------------------
// Build single operator kernel
752 //------------------------------------------------------------------------------
753 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
754 
755   using std::ostringstream;
756   using std::string;
757   int ierr;
758   bool setupdone;
759   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760   if (setupdone) return CEED_ERROR_SUCCESS;
761   Ceed ceed;
762   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763   CeedOperator_Cuda_gen *data;
764   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765   CeedQFunction qf;
766   CeedQFunction_Cuda_gen *qf_data;
767   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769   CeedSize lsize;
770   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
771           numoutputfields, ncomp, dim = 0;
772   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
773   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
774   CeedOperatorField *opinputfields, *opoutputfields;
775   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
776   CeedChkBackend(ierr);
777   CeedQFunctionField *qfinputfields, *qfoutputfields;
778   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
779   CeedChkBackend(ierr);
780   CeedEvalMode emode;
781   CeedBasis basis;
782   CeedBasis_Cuda_shared *basis_data;
783   CeedElemRestriction Erestrict;
784   CeedElemRestriction_Cuda *restr_data;
785 
786   // Check for restriction only identity operator
787   bool is_identity_qf;
788   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
789   if (is_identity_qf) {
790     CeedEvalMode emodein, emodeout;
791     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
792     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
793     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
794       // LCOV_EXCL_START
795       return CeedError(ceed, CEED_ERROR_BACKEND,
796                        "Backend does not implement restriction only identity operators");
797     // LCOV_EXCL_STOP
798   }
799 
800   ostringstream code;
801   string devFunctions(deviceFunctions);
802 
803   // Add atomicAdd function for old NVidia architectures
804   struct cudaDeviceProp prop;
805   Ceed_Cuda *ceed_data;
806   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
807   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
808   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
809     code << atomicAdd;
810   }
811 
812   code << devFunctions;
813 
814   string qFunction(qf_data->qFunctionSource);
815   string qFunctionName(qf_data->qFunctionName);
816   string oper;
817   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
818 
819   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
820   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
821   code << "#define CeedPragmaSIMD\n";
822   code << "#define CEED_ERROR_SUCCESS 0\n\n";
823 
824   // Find dim and Q1d
825   bool useCollograd = true;
826   data->maxP1d = 0;
827   for (CeedInt i = 0; i < numinputfields; i++) {
828     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
829     if (basis != CEED_BASIS_COLLOCATED) {
830       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
831       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
832       CeedChkBackend(ierr);
833 
834       // Check for collocated gradient
835       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
836 
837       // Collect dim and Q1d
838       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
839       bool isTensor;
840       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
841       if (isTensor) {
842         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
843         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
844         if (P1d>data->maxP1d) data->maxP1d = P1d;
845       } else {
846         // LCOV_EXCL_START
847         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
848         // LCOV_EXCL_STOP
849         }
850     }
851   }
852   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
854   for (CeedInt i = 0; i < numoutputfields; i++) {
855     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
856 
857     if (basis != CEED_BASIS_COLLOCATED) {
858       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
859       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
860       CeedChkBackend(ierr);
861 
862       // Collect dim and Q1d
863       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
864       bool isTensor;
865       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
866       if (isTensor) {
867         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
868       } else {
869         // LCOV_EXCL_START
870         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
871         // LCOV_EXCL_STOP
872         }
873 
874       // Check for collocated gradient
875       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
876     }
877   }
878   data->dim = dim;
879   data->Q1d = Q1d;
880 
881   // Define CEED_Q_VLA
882   if (dim != 3 || useCollograd) {
883     code << "\n#define CEED_Q_VLA 1\n\n";
884   } else {
885     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
886   }
887 
888   code << qFunction;
889 
890   // Setup
891   code << "\n// -----------------------------------------------------------------------------\n";
892   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
893   for (CeedInt i = 0; i < numinputfields; i++) {
894     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
895     CeedChkBackend(ierr);
896     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
897       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
898     }
899   }
900 
901   for (CeedInt i = 0; i < numoutputfields; i++) {
902     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
903   }
904 
905   code << "  const CeedInt Dim = "<<dim<<";\n";
906   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
907 
908   code << "  extern __shared__ CeedScalar slice[];\n";
909   code << "  BackendData data;\n";
910   code << "  data.tidx = threadIdx.x;\n";
911   code << "  data.tidy = threadIdx.y;\n";
912   code << "  data.tidz = threadIdx.z;\n";
913   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
914   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
915 
916   code << "\n  // -- Input field constants and basis data --\n";
917   //Initialize constants, and matrices B and G
918   for (CeedInt i = 0; i < numinputfields; i++) {
919     code << "  // ---- Input field "<<i<<" ----\n";
920     // Get elemsize, emode, ncomp
921     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
922     CeedChkBackend(ierr);
923     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
924     CeedChkBackend(ierr);
925     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
926     CeedChkBackend(ierr);
927     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
928     CeedChkBackend(ierr);
929 
930     // Set field constants
931     if (emode != CEED_EVAL_WEIGHT) {
932       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
933       if (basis != CEED_BASIS_COLLOCATED) {
934         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
935         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
936       } else {
937         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
938       }
939       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
940     }
941 
942     // Load basis data
943     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
944     switch (emode) {
945     case CEED_EVAL_NONE:
946       break;
947     case CEED_EVAL_INTERP:
948       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
949       data->B.in[i] = basis_data->d_interp_1d;
950       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
951       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
952       break;
953     case CEED_EVAL_GRAD:
954       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
955       data->B.in[i] = basis_data->d_interp_1d;
956       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
957       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
958       if (useCollograd) {
959         data->G.in[i] = basis_data->d_collo_grad_1d;
960         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
961         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
962       } else {
963         data->G.in[i] = basis_data->d_grad_1d;
964         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
965         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
966       }
967       break;
968     case CEED_EVAL_WEIGHT:
969       break; // No action
970     case CEED_EVAL_DIV:
971       break; // TODO: Not implemented
972     case CEED_EVAL_CURL:
973       break; // TODO: Not implemented
974     }
975   }
976 
977   code << "\n  // -- Output field constants and basis data --\n";
978   for (CeedInt i = 0; i < numoutputfields; i++) {
979     code << "  // ---- Output field "<<i<<" ----\n";
980     // Get elemsize, emode, ncomp
981     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
982     CeedChkBackend(ierr);
983     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
984     CeedChkBackend(ierr);
985     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
986     CeedChkBackend(ierr);
987     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
988     CeedChkBackend(ierr);
989 
990     // Set field constants
991     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
992     if (basis != CEED_BASIS_COLLOCATED) {
993       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
994       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
995     } else {
996       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
997     }
998     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
999 
1000     // Load basis data
1001     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1002     switch (emode) {
1003     case CEED_EVAL_NONE:
1004       break; // No action
1005     case CEED_EVAL_INTERP:
1006       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1007       data->B.out[i] = basis_data->d_interp_1d;
1008       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1009       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1010       break;
1011     case CEED_EVAL_GRAD:
1012       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1013       data->B.out[i] = basis_data->d_interp_1d;
1014       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1015       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1016       if (useCollograd) {
1017         data->G.out[i] = basis_data->d_collo_grad_1d;
1018         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1019         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1020       } else {
1021         data->G.out[i] = basis_data->d_grad_1d;
1022         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1023         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1024       }
1025       break;
1026     // LCOV_EXCL_START
1027     case CEED_EVAL_WEIGHT: {
1028       Ceed ceed;
1029       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1030       return CeedError(ceed, CEED_ERROR_BACKEND,
1031                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1032       break; // Should not occur
1033     }
1034     case CEED_EVAL_DIV:
1035       break; // TODO: Not implemented
1036     case CEED_EVAL_CURL:
1037       break; // TODO: Not implemented
1038       // LCOV_EXCL_STOP
1039     }
1040   }
1041   code << "\n  // -- Element loop --\n";
1042   code << "  __syncthreads();\n";
1043   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1044   // Input basis apply if needed
1045   // Generate the correct eval mode code for each input
1046   code << "    // -- Input field restrictions and basis actions --\n";
1047   for (CeedInt i = 0; i < numinputfields; i++) {
1048     code << "    // ---- Input field "<<i<<" ----\n";
1049     // Get elemsize, emode, ncomp
1050     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1051     CeedChkBackend(ierr);
1052     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1053     CeedChkBackend(ierr);
1054     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1055     CeedChkBackend(ierr);
1056     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1057     CeedChkBackend(ierr);
1058 
1059     // Restriction
1060     if (emode != CEED_EVAL_WEIGHT &&
1061         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1062       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1063 
1064       bool isStrided;
1065       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1066       if (!isStrided) {
1067         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1068         CeedChkBackend(ierr);
1069         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1070         CeedInt compstride;
1071         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1072         code << "    // CompStride: "<<compstride<<"\n";
1073         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1074         data->indices.in[i] = restr_data->d_ind;
1075         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1076       } else {
1077         bool backendstrides;
1078         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1079         CeedChkBackend(ierr);
1080         CeedInt nelem;
1081         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1082         CeedChkBackend(ierr);
1083         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1084         if (!backendstrides) {
1085           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1086           CeedChkBackend(ierr);
1087         }
1088         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1089         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1090       }
1091     }
1092 
1093     // Basis action
1094     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1095     switch (emode) {
1096     case CEED_EVAL_NONE:
1097       if (!useCollograd) {
1098         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1099       }
1100       break;
1101     case CEED_EVAL_INTERP:
1102       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1103       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1104       break;
1105     case CEED_EVAL_GRAD:
1106       if (useCollograd) {
1107         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1108         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1109       } else {
1110         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1111         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1112       }
1113       break;
1114     case CEED_EVAL_WEIGHT:
1115       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1116       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1117       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1118       data->W = basis_data->d_q_weight_1d;
1119       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1120       break; // No action
1121     case CEED_EVAL_DIV:
1122       break; // TODO: Not implemented
1123     case CEED_EVAL_CURL:
1124       break; // TODO: Not implemented
1125     }
1126   }
1127 
1128   // Q function
1129   code << "\n    // -- Output field setup --\n";
1130   for (CeedInt i = 0; i < numoutputfields; i++) {
1131       code << "\n    // ---- Output field "<<i<<" ----\n";
1132     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1133     CeedChkBackend(ierr);
1134     if (emode==CEED_EVAL_GRAD)
1135     {
1136       if (useCollograd) {
1137         //Accumulator for gradient slices
1138         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1139         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1140         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1141         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1142         code << "      }\n";
1143         code << "    }\n";
1144       } else {
1145         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1146       }
1147     }
1148     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1149     {
1150       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1151     }
1152   }
1153   // We treat quadrature points per slice in 3d to save registers
1154   if (useCollograd) {
1155     code << "\n    // Note: Collocated Gradient\n";
1156     code << "#pragma unroll\n";
1157     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1158     code << "      // -- Input fields --\n";
1159     for (CeedInt i = 0; i < numinputfields; i++) {
1160       code << "      // ---- Input field "<<i<<" ----\n";
1161       // Get elemsize, emode, ncomp
1162       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1163       CeedChkBackend(ierr);
1164       // Basis action
1165       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1166       switch (emode) {
1167       case CEED_EVAL_NONE:
1168         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1169 
1170         bool isStrided;
1171         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1172         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1173         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1174         if (!isStrided) {
1175           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1176           CeedChkBackend(ierr);
1177           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1178           CeedInt compstride;
1179           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1180           code << "      // CompStride: "<<compstride<<"\n";
1181           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1182           data->indices.in[i] = restr_data->d_ind;
1183           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1184         } else {
1185           bool backendstrides;
1186           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1187           CeedChkBackend(ierr);
1188           CeedInt nelem;
1189           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1190           CeedChkBackend(ierr);
1191           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1192           if (!backendstrides) {
1193             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1194             CeedChkBackend(ierr);
1195           }
1196           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1197           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1198         }
1199         break;
1200       case CEED_EVAL_INTERP:
1201         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1202         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1203         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1204         code << "      }\n";
1205         break;
1206       case CEED_EVAL_GRAD:
1207         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1208         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1209         break;
1210       case CEED_EVAL_WEIGHT:
1211         code << "      CeedScalar r_q"<<i<<"[1];\n";
1212         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1213         break; // No action
1214       case CEED_EVAL_DIV:
1215         break; // TODO: Not implemented
1216       case CEED_EVAL_CURL:
1217         break; // TODO: Not implemented
1218       }
1219     }
1220     code << "\n      // -- Output fields --\n";
1221     for (CeedInt i = 0; i < numoutputfields; i++) {
1222       code << "      // ---- Output field "<<i<<" ----\n";
1223       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1224       CeedChkBackend(ierr);
1225       // Basis action
1226       switch (emode) {
1227       case CEED_EVAL_NONE:
1228         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1229         break; // No action
1230       case CEED_EVAL_INTERP:
1231         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1232         break;
1233       case CEED_EVAL_GRAD:
1234         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1235         break;
1236       case CEED_EVAL_WEIGHT:
1237         break; // Should not occur
1238       case CEED_EVAL_DIV:
1239         break; // TODO: Not implemented
1240       case CEED_EVAL_CURL:
1241         break; // TODO: Not implemented
1242       }
1243     }
1244   } else {
1245     code << "\n      // Note: No Collocated Gradient\n";
1246     code << "      // -- Input fields --\n";
1247     for (CeedInt i = 0; i < numinputfields; i++) {
1248       code << "      // ---- Input field "<<i<<" ----\n";
1249       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1250     }
1251     code << "      // -- Output fields --\n";
1252     for (CeedInt i = 0; i < numoutputfields; i++) {
1253       code << "      // ---- Output field "<<i<<" ----\n";
1254       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1255     }
1256   }
1257   code << "\n      // -- QFunction Inputs and outputs --\n";
1258   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1259   for (CeedInt i = 0; i < numinputfields; i++) {
1260     code << "      // ---- Input field "<<i<<" ----\n";
1261     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1262   }
1263   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1264   for (CeedInt i = 0; i < numoutputfields; i++) {
1265     code << "      // ---- Output field "<<i<<" ----\n";
1266     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1267   }
1268   code << "\n      // -- Apply QFunction --\n";
1269   code << "      "<<qFunctionName<<"(ctx, ";
1270   if (dim != 3 || useCollograd) {
1271     code << "1";
1272   } else {
1273     code << "Q1d";
1274   }
1275   code << ", in, out);\n";
1276   if (useCollograd) {
1277     code << "\n      // Note: Collocated Gradient\n";
1278     code << "      // -- Output fields --\n";
1279     for (CeedInt i = 0; i < numoutputfields; i++) {
1280       code << "      // ---- Output field "<<i<<" ----\n";
1281       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1282       CeedChkBackend(ierr);
1283       // Basis action
1284       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1285       switch (emode) {
1286       case CEED_EVAL_NONE:
1287         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1288         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1289         code << "      }\n";
1290         break; // No action
1291       case CEED_EVAL_INTERP:
1292         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1293         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1294         code << "      }\n";
1295         break;
1296       case CEED_EVAL_GRAD:
1297         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1298         break;
1299       case CEED_EVAL_WEIGHT:
1300         break; // Should not occur
1301       case CEED_EVAL_DIV:
1302         break; // TODO: Not implemented
1303       case CEED_EVAL_CURL:
1304         break; // TODO: Not implemented
1305       }
1306     }
1307     code << "    }\n";
1308   }
1309 
1310   // Output basis apply if needed
1311   // Generate the correct eval mode code for each output
1312   code << "\n    // -- Output field basis action and restrictions --\n";
1313   for (CeedInt i = 0; i < numoutputfields; i++) {
1314     code << "    // ---- Output field "<<i<<" ----\n";
1315     // Get elemsize, emode, ncomp
1316     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1317     CeedChkBackend(ierr);
1318     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1319     CeedChkBackend(ierr);
1320     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1321     CeedChkBackend(ierr);
1322     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1323     CeedChkBackend(ierr);
1324     // Basis action
1325     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1326     switch (emode) {
1327     case CEED_EVAL_NONE:
1328       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1329       break; // No action
1330     case CEED_EVAL_INTERP:
1331       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1332       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1333       break;
1334     case CEED_EVAL_GRAD:
1335       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1336       if (useCollograd) {
1337         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1338       } else {
1339         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1340       }
1341       break;
1342     // LCOV_EXCL_START
1343     case CEED_EVAL_WEIGHT: {
1344       Ceed ceed;
1345       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1346       return CeedError(ceed, CEED_ERROR_BACKEND,
1347                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1348       break; // Should not occur
1349     }
1350     case CEED_EVAL_DIV:
1351       break; // TODO: Not implemented
1352     case CEED_EVAL_CURL:
1353       break; // TODO: Not implemented
1354       // LCOV_EXCL_STOP
1355     }
1356     // Restriction
1357       bool isStrided;
1358       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1359     if (!isStrided) {
1360       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1361       CeedChkBackend(ierr);
1362       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1363       CeedInt compstride;
1364       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1365       code << "    // CompStride: "<<compstride<<"\n";
1366       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1367       data->indices.out[i] = restr_data->d_ind;
1368       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1369     } else {
1370       bool backendstrides;
1371       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1372       CeedChkBackend(ierr);
1373       CeedInt nelem;
1374       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1375       CeedChkBackend(ierr);
1376       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1377       if (!backendstrides) {
1378         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1379         CeedChkBackend(ierr);
1380       }
1381       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1382       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1383     }
1384   }
1385 
1386   code << "  }\n";
1387   code << "}\n";
1388   code << "// -----------------------------------------------------------------------------\n\n";
1389 
1390   // View kernel for debugging
1391   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1392   CeedDebug(ceed, code.str().c_str());
1393 
1394   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1395                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1396   CeedChkBackend(ierr);
1397   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1398   CeedChkBackend(ierr);
1399 
1400   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1401   return CEED_ERROR_SUCCESS;
1402 }
1403 //------------------------------------------------------------------------------
1404