xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c51789fdf696ccf5b23e2c5307570755cda1780f)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-cuda-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
24 static const char *atomicAdd = QUOTE(
25 //------------------------------------------------------------------------------
26 // Atomic add, for older CUDA
27 //------------------------------------------------------------------------------
28 __device__ double atomicAdd(double *address, double val) {
29   unsigned long long int *address_as_ull = (unsigned long long int *)address;
30   unsigned long long int old = *address_as_ull, assumed;
31   do {
32     assumed = old;
33     old =
34       atomicCAS(address_as_ull, assumed,
35                 __double_as_longlong(val +
36                                      __longlong_as_double(assumed)));
37     // Note: uses integer comparison to avoid hang in case of NaN
38     // (since NaN != NaN)
39   } while (assumed != old);
40   return __longlong_as_double(old);
41 }
42 );
43 
44 static const char *deviceFunctions = QUOTE(
45 
46 //------------------------------------------------------------------------------
47 // Typedefs
48 //------------------------------------------------------------------------------
49 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
50 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
51 
52 typedef struct {
53   CeedInt tidx;
54   CeedInt tidy;
55   CeedInt tidz;
56   CeedInt tid;
57   CeedScalar* slice;
58 } BackendData;
59 
60 //------------------------------------------------------------------------------
61 // Load matrices for basis actions
62 //------------------------------------------------------------------------------
63 template <int P, int Q>
64 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
65   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
66     B[i] = d_B[i];
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D
71 //------------------------------------------------------------------------------
72 
73 //------------------------------------------------------------------------------
74 // L-vector -> E-vector, offsets provided
75 //------------------------------------------------------------------------------
76 template <int NCOMP, int COMPSTRIDE, int P1d>
77 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78   if (data.tidx < P1d) {
79     const CeedInt node = data.tidx;
80     const CeedInt ind = indices[node + elem * P1d];
81     for (CeedInt comp = 0; comp < NCOMP; ++comp)
82       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // L-vector -> E-vector, strided
88 //------------------------------------------------------------------------------
89 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
90 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
91   if (data.tidx < P1d) {
92     const CeedInt node = data.tidx;
93     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
94     for (CeedInt comp = 0; comp < NCOMP; ++comp)
95       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
96   }
97 }
98 
99 //------------------------------------------------------------------------------
100 // E-vector -> L-vector, offsets provided
101 //------------------------------------------------------------------------------
102 template <int NCOMP, int COMPSTRIDE, int P1d>
103 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
104   if (data.tidx < P1d) {
105     const CeedInt node = data.tidx;
106     const CeedInt ind = indices[node + elem * P1d];
107     for (CeedInt comp = 0; comp < NCOMP; ++comp)
108       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
109   }
110 }
111 
112 //------------------------------------------------------------------------------
113 // E-vector -> L-vector, strided
114 //------------------------------------------------------------------------------
115 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
116 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
117   if (data.tidx < P1d) {
118     const CeedInt node = data.tidx;
119     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
120     for (CeedInt comp = 0; comp < NCOMP; ++comp)
121       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
122   }
123 }
124 
125 //------------------------------------------------------------------------------
126 // 1D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NCOMP, int P1d, int Q1d>
129 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130   data.slice[data.tidx] = *U;
131   __syncthreads();
132   *V = 0.0;
133   for (CeedInt i = 0; i < P1d; ++i)
134     *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
135   __syncthreads();
136 }
137 
138 //------------------------------------------------------------------------------
139 // 1D transpose tensor contraction x
140 //------------------------------------------------------------------------------
141 template <int NCOMP, int P1d, int Q1d>
142 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
143   data.slice[data.tidx] = *U;
144   __syncthreads();
145   *V = 0.0;
146   for (CeedInt i = 0; i < Q1d; ++i)
147     *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
148   __syncthreads();
149 }
150 
151 //------------------------------------------------------------------------------
152 // 1D interpolate to quadrature points
153 //------------------------------------------------------------------------------
154 template <int NCOMP, int P1d, int Q1d>
155 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
156   for (CeedInt comp = 0; comp < NCOMP; comp++)
157     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
158 }
159 
160 //------------------------------------------------------------------------------
161 // 1D interpolate transpose
162 //------------------------------------------------------------------------------
163 template <int NCOMP, int P1d, int Q1d>
164 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
165   for (CeedInt comp=0; comp<NCOMP; comp++)
166     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
167 }
168 
169 //------------------------------------------------------------------------------
170 // 1D derivatives at quadrature points
171 //------------------------------------------------------------------------------
172 template <int NCOMP, int P1d, int Q1d>
173 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
174   for (CeedInt comp = 0; comp < NCOMP; comp++)
175     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
176 }
177 
178 //------------------------------------------------------------------------------
179 // 1D derivatives transpose
180 //------------------------------------------------------------------------------
181 template <int NCOMP, int P1d, int Q1d>
182 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
183   for (CeedInt comp = 0; comp < NCOMP; comp++)
184     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
185 }
186 
187 //------------------------------------------------------------------------------
188 // 2D
189 //------------------------------------------------------------------------------
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, offsets provided
193 //------------------------------------------------------------------------------
194 template <int NCOMP, int COMPSTRIDE, int P1d>
195 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
196   if (data.tidx < P1d && data.tidy < P1d) {
197     const CeedInt node = data.tidx + data.tidy*P1d;
198     const CeedInt ind = indices[node + elem * P1d*P1d];
199     for (CeedInt comp = 0; comp < NCOMP; ++comp)
200       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // L-vector -> E-vector, strided
206 //------------------------------------------------------------------------------
207 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
208 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
209   if (data.tidx < P1d && data.tidy < P1d) {
210     const CeedInt node = data.tidx + data.tidy*P1d;
211     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
212     for (CeedInt comp = 0; comp < NCOMP; ++comp)
213       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
214   }
215 }
216 
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, offsets provided
219 //------------------------------------------------------------------------------
220 template <int NCOMP, int COMPSTRIDE, int P1d>
221 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
222   if (data.tidx < P1d && data.tidy < P1d) {
223     const CeedInt node = data.tidx + data.tidy*P1d;
224     const CeedInt ind = indices[node + elem * P1d*P1d];
225     for (CeedInt comp = 0; comp < NCOMP; ++comp)
226       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
227   }
228 }
229 
230 //------------------------------------------------------------------------------
231 // E-vector -> L-vector, strided
232 //------------------------------------------------------------------------------
233 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
234 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
235   if (data.tidx < P1d && data.tidy < P1d) {
236     const CeedInt node = data.tidx + data.tidy*P1d;
237     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
238     for (CeedInt comp = 0; comp < NCOMP; ++comp)
239       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
240   }
241 }
242 
243 //------------------------------------------------------------------------------
244 // 2D tensor contraction x
245 //------------------------------------------------------------------------------
246 template <int NCOMP, int P1d, int Q1d>
247 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
248   data.slice[data.tidx+data.tidy*Q1d] = *U;
249   __syncthreads();
250   *V = 0.0;
251   for (CeedInt i = 0; i < P1d; ++i)
252     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
253   __syncthreads();
254 }
255 
256 //------------------------------------------------------------------------------
257 // 2D tensor contract y
258 //------------------------------------------------------------------------------
259 template <int NCOMP, int P1d, int Q1d>
260 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
261   data.slice[data.tidx+data.tidy*Q1d] = *U;
262   __syncthreads();
263   *V = 0.0;
264   for (CeedInt i = 0; i < P1d; ++i)
265     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
266   __syncthreads();
267 }
268 
269 //------------------------------------------------------------------------------
270 // 2D transpose tensor contract y
271 //------------------------------------------------------------------------------
272 template <int NCOMP, int P1d, int Q1d>
273 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274   data.slice[data.tidx+data.tidy*Q1d] = *U;
275   __syncthreads();
276   *V = 0.0;
277   if (data.tidy < P1d)
278     for (CeedInt i = 0; i < Q1d; ++i)
279       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
280   __syncthreads();
281 }
282 
283 //------------------------------------------------------------------------------
284 // 2D transpose tensor contract x
285 //------------------------------------------------------------------------------
286 template <int NCOMP, int P1d, int Q1d>
287 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
288   data.slice[data.tidx+data.tidy*Q1d] = *U;
289   __syncthreads();
290   *V = 0.0;
291   if (data.tidx < P1d)
292     for (CeedInt i = 0; i < Q1d; ++i)
293       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
294   __syncthreads();
295 }
296 
297 //------------------------------------------------------------------------------
298 // 2D transpose tensor contract and add x
299 //------------------------------------------------------------------------------
300 template <int NCOMP, int P1d, int Q1d>
301 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   data.slice[data.tidx+data.tidy*Q1d] = *U;
303   __syncthreads();
304   if (data.tidx < P1d)
305     for (CeedInt i = 0; i < Q1d; ++i)
306       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
307   __syncthreads();
308 }
309 
310 //------------------------------------------------------------------------------
311 // 2D interpolate to quadrature points
312 //------------------------------------------------------------------------------
313 template <int NCOMP, int P1d, int Q1d>
314 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315   CeedScalar r_t[1];
316   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319   }
320 }
321 
322 //------------------------------------------------------------------------------
323 // 2D interpolate transpose
324 //------------------------------------------------------------------------------
325 template <int NCOMP, int P1d, int Q1d>
326 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327   CeedScalar r_t[1];
328   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 2D derivatives at quadrature points
336 //------------------------------------------------------------------------------
337 template <int NCOMP, int P1d, int Q1d>
338 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339   CeedScalar r_t[1];
340   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345   }
346 }
347 
348 //------------------------------------------------------------------------------
349 // 2D derivatives transpose
350 //------------------------------------------------------------------------------
351 template <int NCOMP, int P1d, int Q1d>
352 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353   CeedScalar r_t[1];
354   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359   }
360 }
361 
362 //------------------------------------------------------------------------------
363 // 3D
364 //------------------------------------------------------------------------------
365 
366 //------------------------------------------------------------------------------
367 // L-vector -> E-vector, offsets provided
368 //------------------------------------------------------------------------------
369 template <int NCOMP, int COMPSTRIDE, int P1d>
370 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
371   if (data.tidx < P1d && data.tidy < P1d)
372     for (CeedInt z = 0; z < P1d; ++z) {
373       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375       for (CeedInt comp = 0; comp < NCOMP; ++comp)
376         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377     }
378 }
379 
380 //------------------------------------------------------------------------------
381 // L-vector -> E-vector, strided
382 //------------------------------------------------------------------------------
383 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
385   if (data.tidx < P1d && data.tidy < P1d)
386     for (CeedInt z = 0; z < P1d; ++z) {
387       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391     }
392 }
393 
394 //------------------------------------------------------------------------------
395 // E-vector -> Q-vector, offests provided
396 //------------------------------------------------------------------------------
397 template <int NCOMP, int COMPSTRIDE, int Q1d>
398 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
399   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
400   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
401   for (CeedInt comp = 0; comp < NCOMP; ++comp)
402     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
403 }
404 
405 //------------------------------------------------------------------------------
406 // E-vector -> Q-vector, strided
407 //------------------------------------------------------------------------------
408 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
409 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
410   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
411   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
412   for (CeedInt comp = 0; comp < NCOMP; ++comp)
413     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
414 }
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> L-vector, offsets provided
418 //------------------------------------------------------------------------------
419 template <int NCOMP, int COMPSTRIDE, int P1d>
420 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
421   if (data.tidx < P1d && data.tidy < P1d)
422     for (CeedInt z = 0; z < P1d; ++z) {
423       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
424       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
425       for (CeedInt comp = 0; comp < NCOMP; ++comp)
426         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
427     }
428 }
429 
430 //------------------------------------------------------------------------------
431 // E-vector -> L-vector, strided
432 //------------------------------------------------------------------------------
433 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
434 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
435   if (data.tidx < P1d && data.tidy < P1d)
436     for (CeedInt z = 0; z < P1d; ++z) {
437       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
438       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
439       for (CeedInt comp = 0; comp < NCOMP; ++comp)
440         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
441     }
442 }
443 
444 //------------------------------------------------------------------------------
445 // 3D tensor contract x
446 //------------------------------------------------------------------------------
447 template <int NCOMP, int P1d, int Q1d>
448 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
449   CeedScalar r_B[P1d];
450   for (CeedInt i = 0; i < P1d; ++i)
451     r_B[i] = B[i + data.tidx*P1d];
452 
453   for (CeedInt k = 0; k < P1d; ++k) {
454     data.slice[data.tidx+data.tidy*Q1d] = U[k];
455     __syncthreads();
456     V[k] = 0.0;
457     for (CeedInt i = 0; i < P1d; ++i)
458       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
459     __syncthreads();
460   }
461 }
462 
463 //------------------------------------------------------------------------------
464 // 3D tensor contract y
465 //------------------------------------------------------------------------------
466 template <int NCOMP, int P1d, int Q1d>
467 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
468   CeedScalar r_B[P1d];
469   for (CeedInt i = 0; i < P1d; ++i)
470     r_B[i] = B[i + data.tidy*P1d];
471 
472   for (CeedInt k = 0; k < P1d; ++k) {
473     data.slice[data.tidx+data.tidy*Q1d] = U[k];
474     __syncthreads();
475     V[k] = 0.0;
476     for (CeedInt i = 0; i < P1d; ++i)
477       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
478     __syncthreads();
479   }
480 }
481 
482 //------------------------------------------------------------------------------
483 // 3D tensor contract z
484 //------------------------------------------------------------------------------
485 template <int NCOMP, int P1d, int Q1d>
486 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
487   for (CeedInt k = 0; k < Q1d; ++k) {
488     V[k] = 0.0;
489     for (CeedInt i = 0; i < P1d; ++i)
490       V[k] += B[i + k*P1d] * U[i]; // Contract z direction
491   }
492 }
493 
494 //------------------------------------------------------------------------------
495 // 3D transpose tensor contract z
496 //------------------------------------------------------------------------------
497 template <int NCOMP, int P1d, int Q1d>
498 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
499   for (CeedInt k = 0; k < Q1d; ++k) {
500     V[k] = 0.0;
501     if (k < P1d)
502       for (CeedInt i = 0; i < Q1d; ++i)
503         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
504   }
505 }
506 
507 //------------------------------------------------------------------------------
508 // 3D transpose tensor contract y
509 //------------------------------------------------------------------------------
510 template <int NCOMP, int P1d, int Q1d>
511 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
512   CeedScalar r_B[Q1d];
513   for (CeedInt i = 0; i < Q1d; ++i)
514     r_B[i] = B[data.tidy + i*P1d];
515 
516   for (CeedInt k = 0; k < P1d; ++k) {
517     data.slice[data.tidx+data.tidy*Q1d] = U[k];
518     __syncthreads();
519     V[k] = 0.0;
520     if (data.tidy < P1d)
521       for (CeedInt i = 0; i < Q1d; ++i)
522         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
523     __syncthreads();
524   }
525 }
526 
527 //------------------------------------------------------------------------------
528 // 3D transpose tensor contract add y
529 //------------------------------------------------------------------------------
530 template <int NCOMP, int P1d, int Q1d>
531 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
532   CeedScalar r_B[Q1d];
533   for (CeedInt i = 0; i < Q1d; ++i)
534     r_B[i] = B[data.tidy + i*P1d];
535 
536   for (CeedInt k = 0; k < P1d; ++k) {
537     data.slice[data.tidx+data.tidy*Q1d] = U[k];
538     __syncthreads();
539     if (data.tidy < P1d)
540       for (CeedInt i = 0; i < Q1d; ++i)
541         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
542     __syncthreads();
543   }
544 }
545 
546 //------------------------------------------------------------------------------
547 // 3D transpose tensor contract x
548 //------------------------------------------------------------------------------
549 template <int NCOMP, int P1d, int Q1d>
550 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
551   CeedScalar r_B[Q1d];
552   for (CeedInt i = 0; i < Q1d; ++i)
553     r_B[i] = B[data.tidx + i*P1d];
554 
555   for (CeedInt k = 0; k < P1d; ++k) {
556     data.slice[data.tidx+data.tidy*Q1d] = U[k];
557     __syncthreads();
558     V[k] = 0.0;
559     if (data.tidx < P1d)
560       for (CeedInt i = 0; i < Q1d; ++i)
561         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
562     __syncthreads();
563   }
564 }
565 
566 //------------------------------------------------------------------------------
567 // 3D transpose tensor contract add x
568 //------------------------------------------------------------------------------
569 template <int NCOMP, int P1d, int Q1d>
570 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
571   CeedScalar r_B[Q1d];
572   for (CeedInt i = 0; i < Q1d; ++i)
573     r_B[i] = B[data.tidx + i*P1d];
574 
575   for (CeedInt k = 0; k < P1d; ++k) {
576     data.slice[data.tidx+data.tidy*Q1d] = U[k];
577     __syncthreads();
578     if (data.tidx < P1d)
579       for (CeedInt i = 0; i < Q1d; ++i)
580         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
581     __syncthreads();
582   }
583 }
584 
585 //------------------------------------------------------------------------------
586 // 3D interpolate to quadrature points
587 //------------------------------------------------------------------------------
588 template <int NCOMP, int P1d, int Q1d>
589 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
590   CeedScalar r_t1[Q1d];
591   CeedScalar r_t2[Q1d];
592   for (CeedInt comp = 0; comp < NCOMP; comp++) {
593     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
594     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
595     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
596   }
597 }
598 
599 //------------------------------------------------------------------------------
600 // 3D interpolate transpose
601 //------------------------------------------------------------------------------
602 template <int NCOMP, int P1d, int Q1d>
603 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
604   CeedScalar r_t1[Q1d];
605   CeedScalar r_t2[Q1d];
606   for (CeedInt comp = 0; comp < NCOMP; comp++) {
607     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
608     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
609     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
610   }
611 }
612 
613 //------------------------------------------------------------------------------
614 // 3D derivatives at quadrature points
615 //------------------------------------------------------------------------------
616 template <int NCOMP, int P1d, int Q1d>
617 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
618   CeedScalar r_t1[Q1d];
619   CeedScalar r_t2[Q1d];
620   for (CeedInt comp = 0; comp < NCOMP; comp++) {
621     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
622     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
623     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
624     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
625     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
626     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
627     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
628     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
629     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
630   }
631 }
632 
633 //------------------------------------------------------------------------------
634 // 3D derivatives transpose
635 //------------------------------------------------------------------------------
636 template <int NCOMP, int P1d, int Q1d>
637 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
638   CeedScalar r_t1[Q1d];
639   CeedScalar r_t2[Q1d];
640   for (CeedInt comp = 0; comp < NCOMP; comp++) {
641     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
642     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
643     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
644     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
645     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
646     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
647     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
648     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
649     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
650   }
651 }
652 
653 //------------------------------------------------------------------------------
654 // 3D collocated derivatives computation
655 //------------------------------------------------------------------------------
656 template <int NCOMP, int Q1d>
657 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
658   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
659     data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
660     __syncthreads();
661     // X derivative
662     r_V[comp+0*NCOMP] = 0.0;
663     for (CeedInt i = 0; i < Q1d; ++i)
664       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
665     // Y derivative
666     r_V[comp+1*NCOMP] = 0.0;
667     for (CeedInt i = 0; i < Q1d; ++i)
668       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
669     // Z derivative
670     r_V[comp+2*NCOMP] = 0.0;
671     for (CeedInt i = 0; i < Q1d; ++i)
672       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
673     __syncthreads();
674   }
675 }
676 
677 //------------------------------------------------------------------------------
678 // 3D collocated derivatives transpose
679 //------------------------------------------------------------------------------
680 template <int NCOMP, int Q1d>
681 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
682   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
683     // X derivative
684     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
685     __syncthreads();
686     for (CeedInt i = 0; i < Q1d; ++i)
687       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
688     __syncthreads();
689     // Y derivative
690     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
691     __syncthreads();
692     for (CeedInt i = 0; i < Q1d; ++i)
693       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
694     __syncthreads();
695     // Z derivative
696     for (CeedInt i = 0; i < Q1d; ++i)
697       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
698   }
699 }
700 
701 //------------------------------------------------------------------------------
702 // 1D quadrature weights
703 //------------------------------------------------------------------------------
704 template <int Q1d>
705 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
706   *w = qweight1d[data.tidx];
707 }
708 
709 //------------------------------------------------------------------------------
710 // 2D quadrature weights
711 //------------------------------------------------------------------------------
712 template <int Q1d>
713 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
714   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
715 }
716 
717 //------------------------------------------------------------------------------
718 // 3D quadrature weights
719 //------------------------------------------------------------------------------
720 template <int Q1d>
721 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
722   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
723   for (CeedInt z = 0; z < Q1d; ++z)
724     w[z] = pw*qweight1d[z];
725 }
726 
727 );
728 //------------------------------------------------------------------------------
729 // Build singe operator kernel
730 //------------------------------------------------------------------------------
731 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
732 
733   using std::ostringstream;
734   using std::string;
735   int ierr;
736   bool setupdone;
737   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
738   if (setupdone) return 0;
739   Ceed ceed;
740   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
741   CeedOperator_Cuda_gen *data;
742   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
743   CeedQFunction qf;
744   CeedQFunction_Cuda_gen *qf_data;
745   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
746   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
747   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
748           numoutputfields, ncomp, dim = 0, lsize;
749   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
750   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
751   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
752   CeedChk(ierr);
753   CeedOperatorField *opinputfields, *opoutputfields;
754   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
755   CeedChk(ierr);
756   CeedQFunctionField *qfinputfields, *qfoutputfields;
757   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
758   CeedChk(ierr);
759   CeedEvalMode emode;
760   CeedBasis basis;
761   CeedBasis_Cuda_shared *basis_data;
762   CeedElemRestriction Erestrict;
763   CeedElemRestriction_Cuda_reg *restr_data;
764 
765   ostringstream code;
766   string devFunctions(deviceFunctions);
767 
768   // Add atomicAdd function for old NVidia architectures
769   struct cudaDeviceProp prop;
770   Ceed_Cuda *ceed_data;
771   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
772   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
773   if (prop.major<6){
774     code << atomicAdd;
775   }
776 
777   code << devFunctions;
778 
779   string qFunction(qf_data->qFunctionSource);
780   string qFunctionName(qf_data->qFunctionName);
781   string oper;
782   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
783 
784   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
785   code << "\n#define CeedPragmaSIMD\n";
786 
787   // Find dim and Q1d
788   bool collograd = false;
789   for (CeedInt i = 0; i < numinputfields; i++) {
790     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
791     if (basis != CEED_BASIS_COLLOCATED) {
792       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
793 
794       // Check for collocated gradient
795       if (basis_data->d_collograd1d)
796         collograd = true;
797 
798       // Collect dim and Q1d
799       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
800       bool isTensor;
801       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
802       if (isTensor) {
803         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
804       } else {
805         // LCOV_EXCL_START
806         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
807         // LCOV_EXCL_STOP
808         }
809     }
810   }
811   // Check output bases for Q1d, dim as well
812   //   The only imput basis might be CEED_BASIS_COLLOCATED
813   for (CeedInt i = 0; i < numoutputfields; i++) {
814     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
815     if (basis != CEED_BASIS_COLLOCATED) {
816       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
817       // Collect dim and Q1d
818       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
819       bool isTensor;
820       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
821       if (isTensor) {
822         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
823       } else {
824         // LCOV_EXCL_START
825         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
826         // LCOV_EXCL_STOP
827         }
828     }
829   }
830   data->dim = dim;
831   data->Q1d = Q1d;
832 
833   // Define CEED_Q_VLA
834   if (dim != 3 || collograd) {
835     code << "\n#define CEED_Q_VLA 1\n\n";
836   } else {
837     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
838   }
839 
840   code << qFunction;
841 
842   // Setup
843   code << "\n// -----------------------------------------------------------------------------\n";
844   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
845   for (CeedInt i = 0; i < numinputfields; i++) {
846     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
847     CeedChk(ierr);
848     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
849       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
850     }
851   }
852 
853   for (CeedInt i = 0; i < numoutputfields; i++) {
854     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
855   }
856 
857   code << "  const CeedInt Dim = "<<dim<<";\n";
858   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
859 
860   code << "  extern __shared__ CeedScalar slice[];\n";
861   code << "  BackendData data;\n";
862   code << "  data.tidx = threadIdx.x;\n";
863   code << "  data.tidy = threadIdx.y;\n";
864   code << "  data.tidz = threadIdx.z;\n";
865   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
866   code << "  data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
867 
868   code << "\n  // -- Input field constants and basis data --\n";
869   //Initialize constants, and matrices B and G
870   for (CeedInt i = 0; i < numinputfields; i++) {
871     code << "  // ---- Input field "<<i<<" ----\n";
872     // Get elemsize, emode, ncomp
873     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
874     CeedChk(ierr);
875     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
876     CeedChk(ierr);
877     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
878     CeedChk(ierr);
879     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
880     CeedChk(ierr);
881 
882     // Set field constants
883     if (emode != CEED_EVAL_WEIGHT) {
884       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
885       if (basis != CEED_BASIS_COLLOCATED) {
886         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
887         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
888       } else {
889         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
890       }
891       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
892     }
893 
894     // Load basis data
895     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
896     switch (emode) {
897     case CEED_EVAL_NONE:
898       break;
899     case CEED_EVAL_INTERP:
900       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
901       data->B.in[i] = basis_data->d_interp1d;
902       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
903       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
904       break;
905     case CEED_EVAL_GRAD:
906       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
907       data->B.in[i] = basis_data->d_interp1d;
908       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
909       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
910       if (basis_data->d_collograd1d) {
911         data->G.in[i] = basis_data->d_collograd1d;
912         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
913         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
914       } else {
915         data->G.in[i] = basis_data->d_grad1d;
916         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
917         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
918       }
919       break;
920     case CEED_EVAL_WEIGHT:
921       break; // No action
922     case CEED_EVAL_DIV:
923       break; // TODO: Not implemented
924     case CEED_EVAL_CURL:
925       break; // TODO: Not implemented
926     }
927   }
928 
929   code << "\n  // -- Output field constants and basis data --\n";
930   for (CeedInt i = 0; i < numoutputfields; i++) {
931     code << "  // ---- Output field "<<i<<" ----\n";
932     // Get elemsize, emode, ncomp
933     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
934     CeedChk(ierr);
935     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
936     CeedChk(ierr);
937     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
938     CeedChk(ierr);
939     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
940     CeedChk(ierr);
941 
942     // Set field constants
943     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
944     if (basis != CEED_BASIS_COLLOCATED) {
945       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
946       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
947     } else {
948       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
949     }
950     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
951 
952     // Load basis data
953     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
954     switch (emode) {
955     case CEED_EVAL_NONE:
956       break; // No action
957     case CEED_EVAL_INTERP:
958       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
959       data->B.out[i] = basis_data->d_interp1d;
960       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
961       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
962       break;
963     case CEED_EVAL_GRAD:
964       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
965       data->B.out[i] = basis_data->d_interp1d;
966       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
967       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
968       if (basis_data->d_collograd1d) {
969         data->G.out[i] = basis_data->d_collograd1d;
970         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
971         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
972       } else {
973         data->G.out[i] = basis_data->d_grad1d;
974         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
975         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
976       }
977       break;
978     // LCOV_EXCL_START
979     case CEED_EVAL_WEIGHT: {
980       Ceed ceed;
981       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
982       return CeedError(ceed, 1,
983                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
984       break; // Should not occur
985     }
986     case CEED_EVAL_DIV:
987       break; // TODO: Not implemented
988     case CEED_EVAL_CURL:
989       break; // TODO: Not implemented
990       // LCOV_EXCL_STOP
991     }
992   }
993   code << "\n  // -- Element loop --\n";
994   code << "  __syncthreads();\n";
995   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
996   // Input basis apply if needed
997   // Generate the correct eval mode code for each input
998   code << "    // -- Input field restrictions and basis actions --\n";
999   for (CeedInt i = 0; i < numinputfields; i++) {
1000     code << "    // ---- Input field "<<i<<" ----\n";
1001     // Get elemsize, emode, ncomp
1002     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1003     CeedChk(ierr);
1004     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1005     CeedChk(ierr);
1006     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1007     CeedChk(ierr);
1008     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1009     CeedChk(ierr);
1010 
1011     // Restriction
1012     if (emode != CEED_EVAL_WEIGHT &&
1013         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1014       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1015 
1016       bool isStrided;
1017       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1018       if (!isStrided) {
1019         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1020         CeedChk(ierr);
1021         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1022         CeedInt compstride;
1023         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1024         code << "    // CompStride: "<<compstride<<"\n";
1025         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1026         data->indices.in[i] = restr_data->d_ind;
1027         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1028       } else {
1029         bool backendstrides;
1030         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1031         CeedChk(ierr);
1032         CeedInt nelem;
1033         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1034         CeedChk(ierr);
1035         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1036         if (!backendstrides) {
1037           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1038           CeedChk(ierr);
1039         }
1040         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1041         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1042       }
1043     }
1044 
1045     // Basis action
1046     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1047     switch (emode) {
1048     case CEED_EVAL_NONE:
1049       if (!basis_data->d_collograd1d) {
1050         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1051       }
1052       break;
1053     case CEED_EVAL_INTERP:
1054       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1055       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1056       break;
1057     case CEED_EVAL_GRAD:
1058       if (basis_data->d_collograd1d) {
1059         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1060         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1061       } else {
1062         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1063         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1064       }
1065       break;
1066     case CEED_EVAL_WEIGHT:
1067       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1068       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1069       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1070       data->W = basis_data->d_qweight1d;
1071       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1072       break; // No action
1073     case CEED_EVAL_DIV:
1074       break; // TODO: Not implemented
1075     case CEED_EVAL_CURL:
1076       break; // TODO: Not implemented
1077     }
1078   }
1079 
1080   // Q function
1081   code << "\n    // -- Output field setup --\n";
1082   for (CeedInt i = 0; i < numoutputfields; i++) {
1083       code << "\n    // ---- Output field "<<i<<" ----\n";
1084     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1085     CeedChk(ierr);
1086     if (emode==CEED_EVAL_GRAD)
1087     {
1088       if (basis_data->d_collograd1d) {
1089         //Accumulator for gradient slices
1090         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1091         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1092         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1093         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1094         code << "      }\n";
1095         code << "    }\n";
1096       } else {
1097         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1098       }
1099     }
1100     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1101     {
1102       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1103     }
1104   }
1105   // We treat quadrature points per slice in 3d to save registers
1106   if (basis_data->d_collograd1d) {
1107     code << "\n    // Note: Collocated Gradient\n";
1108     code << "#pragma unroll\n";
1109     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1110     code << "      // -- Input fields --\n";
1111     for (CeedInt i = 0; i < numinputfields; i++) {
1112       code << "      // ---- Input field "<<i<<" ----\n";
1113       // Get elemsize, emode, ncomp
1114       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1115       CeedChk(ierr);
1116       // Basis action
1117       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1118       switch (emode) {
1119       case CEED_EVAL_NONE:
1120         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1121 
1122         bool isStrided;
1123         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1124         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1125         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1126         if (!isStrided) {
1127           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1128           CeedChk(ierr);
1129           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1130           CeedInt compstride;
1131           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1132           code << "      // CompStride: "<<compstride<<"\n";
1133           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1134           data->indices.in[i] = restr_data->d_ind;
1135           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1136         } else {
1137           bool backendstrides;
1138           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1139           CeedChk(ierr);
1140           CeedInt nelem;
1141           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1142           CeedChk(ierr);
1143           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1144           if (!backendstrides) {
1145             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1146             CeedChk(ierr);
1147           }
1148           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1149           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1150         }
1151         break;
1152       case CEED_EVAL_INTERP:
1153         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1154         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1155         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1156         code << "      }\n";
1157         break;
1158       case CEED_EVAL_GRAD:
1159         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1160         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1161         break;
1162       case CEED_EVAL_WEIGHT:
1163         code << "      CeedScalar r_q"<<i<<"[1];\n";
1164         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1165         break; // No action
1166       case CEED_EVAL_DIV:
1167         break; // TODO: Not implemented
1168       case CEED_EVAL_CURL:
1169         break; // TODO: Not implemented
1170       }
1171     }
1172     code << "\n      // -- Output fields --\n";
1173     for (CeedInt i = 0; i < numoutputfields; i++) {
1174       code << "      // ---- Output field "<<i<<" ----\n";
1175       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1176       CeedChk(ierr);
1177       // Basis action
1178       switch (emode) {
1179       case CEED_EVAL_NONE:
1180         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1181         break; // No action
1182       case CEED_EVAL_INTERP:
1183         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1184         break;
1185       case CEED_EVAL_GRAD:
1186         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1187         break;
1188       case CEED_EVAL_WEIGHT:
1189         break; // Should not occur
1190       case CEED_EVAL_DIV:
1191         break; // TODO: Not implemented
1192       case CEED_EVAL_CURL:
1193         break; // TODO: Not implemented
1194       }
1195     }
1196   } else {
1197     code << "\n      // Note: No Collocated Gradient\n";
1198     code << "      // -- Input fields --\n";
1199     for (CeedInt i = 0; i < numinputfields; i++) {
1200       code << "      // ---- Input field "<<i<<" ----\n";
1201       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1202     }
1203     code << "      // -- Output fields --\n";
1204     for (CeedInt i = 0; i < numoutputfields; i++) {
1205       code << "      // ---- Output field "<<i<<" ----\n";
1206       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1207     }
1208   }
1209   code << "\n      // -- QFunction Inputs and outputs --\n";
1210   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1211   for (CeedInt i = 0; i < numinputfields; i++) {
1212     code << "      // ---- Input field "<<i<<" ----\n";
1213     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1214   }
1215   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1216   for (CeedInt i = 0; i < numoutputfields; i++) {
1217     code << "      // ---- Output field "<<i<<" ----\n";
1218     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1219   }
1220   code << "\n      // -- Apply QFunction --\n";
1221   code << "      "<<qFunctionName<<"(ctx, ";
1222   if (dim != 3 || basis_data->d_collograd1d) {
1223     code << "1";
1224   } else {
1225     code << "Q1d";
1226   }
1227   code << ", in, out);\n";
1228   if (basis_data->d_collograd1d) {
1229     code << "\n      // Note: Collocated Gradient\n";
1230     code << "      // -- Output fields --\n";
1231     for (CeedInt i = 0; i < numoutputfields; i++) {
1232       code << "      // ---- Output field "<<i<<" ----\n";
1233       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1234       CeedChk(ierr);
1235       // Basis action
1236       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1237       switch (emode) {
1238       case CEED_EVAL_NONE:
1239         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1240         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1241         code << "      }\n";
1242         break; // No action
1243       case CEED_EVAL_INTERP:
1244         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1245         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1246         code << "      }\n";
1247         break;
1248       case CEED_EVAL_GRAD:
1249         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1250         break;
1251       case CEED_EVAL_WEIGHT:
1252         break; // Should not occur
1253       case CEED_EVAL_DIV:
1254         break; // TODO: Not implemented
1255       case CEED_EVAL_CURL:
1256         break; // TODO: Not implemented
1257       }
1258     }
1259     code << "    }\n";
1260   }
1261 
1262   // Output basis apply if needed
1263   // Generate the correct eval mode code for each output
1264   code << "\n    // -- Output field basis action and restrictions --\n";
1265   for (CeedInt i = 0; i < numoutputfields; i++) {
1266     code << "    // ---- Output field "<<i<<" ----\n";
1267     // Get elemsize, emode, ncomp
1268     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1269     CeedChk(ierr);
1270     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1271     CeedChk(ierr);
1272     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1273     CeedChk(ierr);
1274     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1275     CeedChk(ierr);
1276     // Basis action
1277     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1278     switch (emode) {
1279     case CEED_EVAL_NONE:
1280       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1281       break; // No action
1282     case CEED_EVAL_INTERP:
1283       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1284       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1285       break;
1286     case CEED_EVAL_GRAD:
1287       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1288       if (basis_data->d_collograd1d) {
1289         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1290       } else {
1291         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1292       }
1293       break;
1294     // LCOV_EXCL_START
1295     case CEED_EVAL_WEIGHT: {
1296       Ceed ceed;
1297       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1298       return CeedError(ceed, 1,
1299                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1300       break; // Should not occur
1301     }
1302     case CEED_EVAL_DIV:
1303       break; // TODO: Not implemented
1304     case CEED_EVAL_CURL:
1305       break; // TODO: Not implemented
1306       // LCOV_EXCL_STOP
1307     }
1308     // Restriction
1309       bool isStrided;
1310       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1311     if (!isStrided) {
1312       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1313       CeedChk(ierr);
1314       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1315       CeedInt compstride;
1316       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1317       code << "    // CompStride: "<<compstride<<"\n";
1318       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1319       data->indices.out[i] = restr_data->d_ind;
1320       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1321     } else {
1322       bool backendstrides;
1323       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1324       CeedChk(ierr);
1325       CeedInt nelem;
1326       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1327       CeedChk(ierr);
1328       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1329       if (!backendstrides) {
1330         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1331         CeedChk(ierr);
1332       }
1333       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1334       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1335     }
1336   }
1337 
1338   code << "  }\n";
1339   code << "}\n";
1340   code << "// -----------------------------------------------------------------------------\n\n";
1341 
1342   // View kernel for debugging
1343   CeedDebug(code.str().c_str());
1344 
1345   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1346   CeedChk(ierr);
1347   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1348   CeedChk(ierr);
1349 
1350   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1351   return 0;
1352 }
1353 //------------------------------------------------------------------------------
1354