xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 64d3f0c08261acc2586bed8a7fe6b77e9bb9f0e1)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-cuda-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
// Software double-precision atomicAdd built on atomicCAS, for devices without
// a native double atomicAdd (NOTE(review): native support starts at SM60 —
// confirm this string is only included for older architectures).
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the bits as 64-bit integers so CAS can compare exactly
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
43 
44 static const char *deviceFunctions = QUOTE(
45 
46 //------------------------------------------------------------------------------
47 // Typedefs
48 //------------------------------------------------------------------------------
// Bundles of field pointers passed to the generated kernel; hard cap of
// 16 input and 16 output fields per operator
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
51 
// Per-thread context handed to every device helper below.
typedef struct {
  CeedInt tidx;       // thread index in x (NOTE(review): presumably threadIdx.x, set by the generated kernel — confirm)
  CeedInt tidy;       // thread index in y
  CeedInt tidz;       // thread index in z
  CeedInt tid;        // used as the starting offset of loadMatrix's block-stride copy (appears to be the flattened block-local id)
  CeedScalar* slice;  // shared-memory scratch used by the tensor contractions
} BackendData;
59 
60 //------------------------------------------------------------------------------
61 // Load matrices for basis actions
62 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  // Block-cooperative copy of the P*Q basis matrix: each thread starts at its
  // own id and strides by the total number of threads in the block
  const CeedInt step = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += step)
    B[k] = d_B[k];
}
68 
69 //------------------------------------------------------------------------------
70 // 1D
71 //------------------------------------------------------------------------------
72 
73 //------------------------------------------------------------------------------
74 // L-vector -> E-vector, offsets provided
75 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Each of the first P1d threads in x gathers its node's values for every
  // component, using the restriction indices (no barriers needed)
  if (data.tidx >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + c * COMPSTRIDE];
}
85 
86 //------------------------------------------------------------------------------
87 // L-vector -> E-vector, strided
88 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Strided (identity-restriction) gather: address computed from compile-time
  // strides instead of an index array
  if (data.tidx >= P1d)
    return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
98 
99 //------------------------------------------------------------------------------
100 // E-vector -> L-vector, offsets provided
101 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter one node's values back to the L-vector; atomicAdd because several
  // elements may contribute to the same node
  if (data.tidx >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[c]);
}
111 
112 //------------------------------------------------------------------------------
113 // E-vector -> L-vector, strided
114 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Strided scatter-add; plain += suffices since strided nodes are not shared
  // across elements
  if (data.tidx >= P1d)
    return;
  const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
124 
125 //------------------------------------------------------------------------------
126 // 1D tensor contraction x
127 //------------------------------------------------------------------------------
// 1D x contraction: exchange per-thread values through the shared slice, then
// V[tidx] = sum_i B[i + tidx*P1d] * slice[i]. Barriers are block-wide, so
// callers must invoke this from every thread of the block (they do — see the
// interp/grad wrappers below).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // all reads done before the caller reuses slice
}
137 
138 //------------------------------------------------------------------------------
139 // 1D transpose tensor contraction x
140 //------------------------------------------------------------------------------
// 1D transpose x contraction (Q1d -> P1d values); reads Q1d slice entries, so
// the block is assumed to span at least Q1d threads in x (NOTE(review):
// launch configuration set by the kernel builder — confirm).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i)
    *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // all reads done before the caller reuses slice
}
150 
151 //------------------------------------------------------------------------------
152 // 1D interpolate to quadrature points
153 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Apply the 1D interpolation matrix B, one component at a time
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
159 
160 //------------------------------------------------------------------------------
161 // 1D interpolate transpose
162 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Apply the transpose of the 1D interpolation matrix, component by component
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
168 
169 //------------------------------------------------------------------------------
170 // 1D derivatives at quadrature points
171 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // In 1D the gradient is just the derivative matrix G applied in x
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
177 
178 //------------------------------------------------------------------------------
179 // 1D derivatives transpose
180 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad1d: apply G^T in x for each component
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
186 
187 //------------------------------------------------------------------------------
188 // 2D
189 //------------------------------------------------------------------------------
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, offsets provided
193 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // A P1d x P1d patch of threads gathers one node each via the restriction
  // indices (no barriers needed)
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[dof + c * COMPSTRIDE];
}
203 
204 //------------------------------------------------------------------------------
205 // L-vector -> E-vector, strided
206 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Strided (identity-restriction) gather over the P1d x P1d thread patch
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt base = (data.tidx + data.tidy*P1d) * STRIDES_NODE
                       + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
216 
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, offsets provided
219 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add back to the L-vector; atomicAdd because elements share nodes
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
  for (CeedInt c = 0; c < NCOMP; c++)
    atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[c]);
}
229 
230 //------------------------------------------------------------------------------
231 // E-vector -> L-vector, strided
232 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Strided scatter-add; plain += since strided nodes are element-private
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  const CeedInt base = (data.tidx + data.tidy*P1d) * STRIDES_NODE
                       + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
242 
243 //------------------------------------------------------------------------------
244 // 2D tensor contraction x
245 //------------------------------------------------------------------------------
// 2D x contraction: values are exchanged through the Q1d-wide shared slice,
// then each thread forms V = sum_i B[i + tidx*P1d] * slice[i + tidy*Q1d].
// Block-wide barriers: must be called by every thread of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads(); // reads complete before the caller reuses slice
}
255 
256 //------------------------------------------------------------------------------
257 // 2D tensor contract y
258 //------------------------------------------------------------------------------
// 2D y contraction: same shared-slice exchange as ContractX2d but summing
// along the y index. Block-wide barriers: call from every thread.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads(); // reads complete before the caller reuses slice
}
268 
269 //------------------------------------------------------------------------------
270 // 2D transpose tensor contract y
271 //------------------------------------------------------------------------------
// 2D transpose y contraction (Q1d -> P1d rows). Only threads with
// tidy < P1d produce a value; both barriers sit outside the divergent guard
// so every thread still reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}
282 
283 //------------------------------------------------------------------------------
284 // 2D transpose tensor contract x
285 //------------------------------------------------------------------------------
// 2D transpose x contraction (Q1d -> P1d columns). Only threads with
// tidx < P1d produce a value; barriers stay outside the divergent guard.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}
296 
297 //------------------------------------------------------------------------------
298 // 2D transpose tensor contract and add x
299 //------------------------------------------------------------------------------
// Accumulating variant of ContractXTranspose2d: V is NOT zeroed, so the
// contribution is added onto the caller's running sum (used for the second
// directional term in gradTranspose2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}
309 
310 //------------------------------------------------------------------------------
311 // 2D interpolate to quadrature points
312 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract B in x, then in y, per component
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
321 
322 //------------------------------------------------------------------------------
323 // 2D interpolate transpose
324 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: undo the y contraction first, then x
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
333 
334 //------------------------------------------------------------------------------
335 // 2D derivatives at quadrature points
336 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // d/dx: derivative matrix G in x, interpolation B in y.
  // d/dy: B in x, G in y. The two directional blocks of r_V are
  // component-major, NCOMP entries apart.
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_G, &r_V[c + NCOMP]);
  }
}
347 
348 //------------------------------------------------------------------------------
349 // 2D derivatives transpose
350 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad2d: the x-derivative block seeds r_V, then the
  // y-derivative block is accumulated via the Add variant
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_G, &r_V[c]);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + NCOMP], c_G, scratch);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, scratch, c_B, &r_V[c]);
  }
}
361 
362 //------------------------------------------------------------------------------
363 // 3D
364 //------------------------------------------------------------------------------
365 
366 //------------------------------------------------------------------------------
367 // L-vector -> E-vector, offsets provided
368 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Each (tidx, tidy) thread walks its column of z-levels, gathering all
  // components per node via the restriction indices
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d
                                + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[dof + c * COMPSTRIDE];
  }
}
379 
380 //------------------------------------------------------------------------------
381 // L-vector -> E-vector, strided
382 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Strided gather: each (tidx, tidy) thread reads its z-column for all
  // components using compile-time strides
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (data.tidx + data.tidy*P1d + z*P1d*P1d) * STRIDES_NODE
                         + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*P1d] = d_u[base + c * STRIDES_COMP];
  }
}
393 
394 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
396 //------------------------------------------------------------------------------
// Read one z = q slice of quadrature-point data for all components, using the
// restriction indices. Unguarded: every thread is assumed to map to a valid
// (tidx, tidy) quadrature point in the Q1d x Q1d slice.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; // stray ';;' removed
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}
404 
405 //------------------------------------------------------------------------------
406 // E-vector -> Q-vector, strided
407 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  // Read one z = q slice of quadrature data for every component, strided form
  const CeedInt base = (data.tidx + data.tidy*Q1d + q*Q1d*Q1d) * STRIDES_NODE
                       + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> L-vector, offsets provided
418 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add each z-level back to the L-vector; atomicAdd because elements
  // share nodes
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d
                                + elem * P1d*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[z + c*P1d]);
  }
}
429 
430 //------------------------------------------------------------------------------
431 // E-vector -> L-vector, strided
432 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Strided scatter-add over the z-column; plain += since strided nodes are
  // element-private
  if (data.tidx >= P1d || data.tidy >= P1d)
    return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt base = (data.tidx + data.tidy*P1d + z*P1d*P1d) * STRIDES_NODE
                         + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c * STRIDES_COMP] += r_v[z + c*P1d];
  }
}
443 
444 //------------------------------------------------------------------------------
445 // 3D tensor contract x
446 //------------------------------------------------------------------------------
// 3D x contraction: U holds P1d z-values per thread; each z-slab is exchanged
// through the Q1d-wide shared slice in turn. This thread's row of B is cached
// in registers up front. Block-wide barriers: call from every thread.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d]; // row of B for this thread's x index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads(); // slab visible to all threads before reading
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads(); // reads done before the next slab overwrites slice
  }
}
462 
463 //------------------------------------------------------------------------------
464 // 3D tensor contract y
465 //------------------------------------------------------------------------------
// 3D y contraction: same slab-by-slab shared exchange as ContractX3d, but
// caching the y row of B and summing along the y index.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d]; // row of B for this thread's y index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads(); // slab visible to all threads before reading
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads(); // reads done before the next slab overwrites slice
  }
}
481 
482 //------------------------------------------------------------------------------
483 // 3D tensor contract z
484 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // The z contraction is thread-local: each thread owns its full z line in
  // registers, so no shared-memory exchange or barrier is required
  for (CeedInt q = 0; q < Q1d; ++q) {
    CeedScalar acc = 0.0;
    for (CeedInt n = 0; n < P1d; ++n)
      acc += B[n + q*P1d] * U[n]; // Contract z direction
    V[q] = acc;
  }
}
493 
494 //------------------------------------------------------------------------------
495 // 3D transpose tensor contract z
496 //------------------------------------------------------------------------------
// 3D transpose z contraction, fully in registers (no barrier needed).
// Zeroes all Q1d entries of V; only the first P1d receive contributions, so
// V must have room for Q1d scalars (callers pass Q1d-sized temporaries).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
506 
507 //------------------------------------------------------------------------------
508 // 3D transpose tensor contract y
509 //------------------------------------------------------------------------------
// 3D transpose y contraction (Q1d -> P1d). Barriers sit outside the
// divergent tidy < P1d guard, so every thread reaches them.
// NOTE(review): the r_B prefetch below runs for ALL threads, so a thread with
// tidy >= P1d indexes B[data.tidy + i*P1d] past P1d*Q1d entries when
// Q1d > P1d; the values are never used, but confirm B's allocation tolerates
// the read.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // column of B for this thread's y index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
526 
527 //------------------------------------------------------------------------------
528 // 3D transpose tensor contract add y
529 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeY3d: V[k] is NOT zeroed, so the
// contribution adds onto the caller's running sum. Same unguarded r_B
// prefetch caveat as ContractTransposeY3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // column of B for this thread's y index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
545 
546 //------------------------------------------------------------------------------
547 // 3D transpose tensor contract x
548 //------------------------------------------------------------------------------
// 3D transpose x contraction (Q1d -> P1d). Barriers sit outside the
// divergent tidx < P1d guard, so every thread reaches them.
// NOTE(review): the r_B prefetch runs for ALL threads, so tidx >= P1d may
// index B past P1d*Q1d entries when Q1d > P1d; values unused — confirm the
// read is safe.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // column of B for this thread's x index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
565 
566 //------------------------------------------------------------------------------
567 // 3D transpose tensor contract add x
568 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d: V[k] is NOT zeroed, so the
// contribution adds onto the caller's running sum. Same unguarded r_B
// prefetch caveat as ContractTransposeX3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // column of B for this thread's x index

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k]; // publish slab k
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
584 
585 //------------------------------------------------------------------------------
586 // 3D interpolate to quadrature points
587 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract B in x, y, then z per component
  CeedScalar buf1[Q1d];
  CeedScalar buf2[Q1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, buf1);
    ContractY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*Q1d]);
  }
}
598 
599 //------------------------------------------------------------------------------
600 // 3D interpolate transpose
601 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: undo the z contraction first, then y, then x
  CeedScalar buf1[Q1d];
  CeedScalar buf2[Q1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, buf1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*P1d]);
  }
}
612 
613 //------------------------------------------------------------------------------
614 // 3D derivatives at quadrature points
615 //------------------------------------------------------------------------------
// 3D gradient at quadrature points: for each component, apply the derivative
// matrix G in exactly one direction and B in the other two, filling the x-,
// y-, and z-derivative blocks of r_V (each NCOMP*Q1d apart).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
632 
633 //------------------------------------------------------------------------------
634 // 3D derivatives transpose
635 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient (quadrature points -> nodes): sum over directions d of
  // the transpose contraction that applies G^T along d and B^T along the
  // other two. The first direction overwrites r_V; the remaining directions
  // accumulate into it via the Add variant of the X contraction.
  CeedScalar r_s1[Q1d];
  CeedScalar r_s2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    for (CeedInt d = 0; d < 3; d++) {
      ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + d*NCOMP*Q1d, (d == 2 ? c_G : c_B), r_s1);
      ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_s1, (d == 1 ? c_G : c_B), r_s2);
      if (d == 0)
        ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_s2, c_G, r_V + comp*P1d);
      else
        ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_s2, c_B, r_V + comp*P1d);
    }
  }
}
652 
653 //------------------------------------------------------------------------------
654 // 3D collocated derivatives computation
655 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated derivatives at the z-slice with quadrature index q.
  // r_U holds NCOMP components of Q1d z-values each; r_V receives, per
  // component, the X, Y, and Z derivative at this slice (NCOMP-strided).
  // X and Y contractions exchange the slice through shared memory
  // (data.slice); the Z contraction reads thread-private registers only.
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Publish this thread's value of the current component to the shared slice
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads(); // slice must be fully written before any thread reads it
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative (registers only; no slice access)
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads(); // all reads done before the next component overwrites the slice
  }
}
676 
677 //------------------------------------------------------------------------------
678 // 3D collocated derivatives transpose
679 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of gradCollo3d: accumulate (+=) the contributions of slice q
  // into r_V. r_U holds, per component, the X/Y/Z derivative values at this
  // slice (NCOMP-strided); c_G is applied with transposed index roles
  // relative to gradCollo3d. Caller is responsible for zeroing r_V first.
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads(); // slice fully written before reads
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads(); // reads done before the slice is reused below
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads(); // slice fully written before reads
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads(); // reads done before the next component's write
    // Z derivative: registers only, scattered over all z-levels; completed
    // once every slice q has been processed
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
700 
701 //------------------------------------------------------------------------------
702 // 1D quadrature weights
703 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 1D quadrature weight at this thread's point (indexed by tidx).
  // Q1d is unused here; kept for interface uniformity with weight2d/weight3d.
  w[0] = qweight1d[data.tidx];
}
708 
709 //------------------------------------------------------------------------------
710 // 2D quadrature weights
711 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product 2D quadrature weight at the point (tidx, tidy).
  const CeedScalar wx = qweight1d[data.tidx];
  const CeedScalar wy = qweight1d[data.tidy];
  w[0] = wx*wy;
}
716 
717 //------------------------------------------------------------------------------
718 // 3D quadrature weights
719 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product 3D quadrature weights: one value per z-level, all sharing
  // this thread's (tidx, tidy) planar factor.
  const CeedScalar wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt k = 0; k < Q1d; ++k)
    w[k] = wxy*qweight1d[k];
}
726 
727 );
728 //------------------------------------------------------------------------------
729 // Build singe operator kernel
730 //------------------------------------------------------------------------------
731 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
732 
733   using std::ostringstream;
734   using std::string;
735   int ierr;
736   bool setupdone;
737   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
738   if (setupdone) return 0;
739   Ceed ceed;
740   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
741   CeedOperator_Cuda_gen *data;
742   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
743   CeedQFunction qf;
744   CeedQFunction_Cuda_gen *qf_data;
745   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
746   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
747   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
748           numoutputfields, ncomp, dim = 0, lsize;
749   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
750   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
751   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
752   CeedChk(ierr);
753   CeedOperatorField *opinputfields, *opoutputfields;
754   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
755   CeedChk(ierr);
756   CeedQFunctionField *qfinputfields, *qfoutputfields;
757   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
758   CeedChk(ierr);
759   CeedEvalMode emode;
760   CeedBasis basis;
761   CeedBasis_Cuda_shared *basis_data;
762   CeedElemRestriction Erestrict;
763   CeedElemRestriction_Cuda_reg *restr_data;
764 
765   ostringstream code;
766   string devFunctions(deviceFunctions);
767 
768   // Add atomicAdd function for old NVidia architectures
769   struct cudaDeviceProp prop;
770   Ceed_Cuda *ceed_data;
771   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
772   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
773   if (prop.major<6){
774     code << atomicAdd;
775   }
776 
777   code << devFunctions;
778 
779   string qFunction(qf_data->qFunctionSource);
780   string qFunctionName(qf_data->qFunctionName);
781   string oper;
782   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
783 
784   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
785   code << "\n#define CeedPragmaSIMD\n";
786 
787   // Find dim and Q1d
788   bool useCollograd = true;
789   for (CeedInt i = 0; i < numinputfields; i++) {
790     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
791     if (basis != CEED_BASIS_COLLOCATED) {
792       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
793       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
794       CeedChk(ierr);
795 
796       // Check for collocated gradient
797       if (emode == CEED_EVAL_GRAD)
798         useCollograd = useCollograd && !!basis_data->d_collograd1d;
799 
800       // Collect dim and Q1d
801       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
802       bool isTensor;
803       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
804       if (isTensor) {
805         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
806       } else {
807         // LCOV_EXCL_START
808         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
809         // LCOV_EXCL_STOP
810         }
811     }
812   }
813   // Check output bases for Q1d, dim as well
//   The only input basis might be CEED_BASIS_COLLOCATED
815   for (CeedInt i = 0; i < numoutputfields; i++) {
816     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
817 
818     if (basis != CEED_BASIS_COLLOCATED) {
819       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
820       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
821       CeedChk(ierr);
822 
823       // Collect dim and Q1d
824       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
825       bool isTensor;
826       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
827       if (isTensor) {
828         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
829       } else {
830         // LCOV_EXCL_START
831         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
832         // LCOV_EXCL_STOP
833         }
834 
835       // Check for collocated gradient
836       if (emode == CEED_EVAL_GRAD)
837         useCollograd = useCollograd && basis_data->d_collograd1d;
838     }
839   }
840   data->dim = dim;
841   data->Q1d = Q1d;
842 
843   // Define CEED_Q_VLA
844   if (dim != 3 || useCollograd) {
845     code << "\n#define CEED_Q_VLA 1\n\n";
846   } else {
847     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
848   }
849 
850   code << qFunction;
851 
852   // Setup
853   code << "\n// -----------------------------------------------------------------------------\n";
854   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
855   for (CeedInt i = 0; i < numinputfields; i++) {
856     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
857     CeedChk(ierr);
858     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
859       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
860     }
861   }
862 
863   for (CeedInt i = 0; i < numoutputfields; i++) {
864     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
865   }
866 
867   code << "  const CeedInt Dim = "<<dim<<";\n";
868   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
869 
870   code << "  extern __shared__ CeedScalar slice[];\n";
871   code << "  BackendData data;\n";
872   code << "  data.tidx = threadIdx.x;\n";
873   code << "  data.tidy = threadIdx.y;\n";
874   code << "  data.tidz = threadIdx.z;\n";
875   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
876   code << "  data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
877 
878   code << "\n  // -- Input field constants and basis data --\n";
879   //Initialize constants, and matrices B and G
880   for (CeedInt i = 0; i < numinputfields; i++) {
881     code << "  // ---- Input field "<<i<<" ----\n";
882     // Get elemsize, emode, ncomp
883     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
884     CeedChk(ierr);
885     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
886     CeedChk(ierr);
887     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
888     CeedChk(ierr);
889     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
890     CeedChk(ierr);
891 
892     // Set field constants
893     if (emode != CEED_EVAL_WEIGHT) {
894       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
895       if (basis != CEED_BASIS_COLLOCATED) {
896         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
897         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
898       } else {
899         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
900       }
901       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
902     }
903 
904     // Load basis data
905     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
906     switch (emode) {
907     case CEED_EVAL_NONE:
908       break;
909     case CEED_EVAL_INTERP:
910       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
911       data->B.in[i] = basis_data->d_interp1d;
912       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
913       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
914       break;
915     case CEED_EVAL_GRAD:
916       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
917       data->B.in[i] = basis_data->d_interp1d;
918       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
919       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
920       if (useCollograd) {
921         data->G.in[i] = basis_data->d_collograd1d;
922         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
923         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
924       } else {
925         data->G.in[i] = basis_data->d_grad1d;
926         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
927         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
928       }
929       break;
930     case CEED_EVAL_WEIGHT:
931       break; // No action
932     case CEED_EVAL_DIV:
933       break; // TODO: Not implemented
934     case CEED_EVAL_CURL:
935       break; // TODO: Not implemented
936     }
937   }
938 
939   code << "\n  // -- Output field constants and basis data --\n";
940   for (CeedInt i = 0; i < numoutputfields; i++) {
941     code << "  // ---- Output field "<<i<<" ----\n";
942     // Get elemsize, emode, ncomp
943     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
944     CeedChk(ierr);
945     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
946     CeedChk(ierr);
947     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
948     CeedChk(ierr);
949     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
950     CeedChk(ierr);
951 
952     // Set field constants
953     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
954     if (basis != CEED_BASIS_COLLOCATED) {
955       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
956       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
957     } else {
958       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
959     }
960     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
961 
962     // Load basis data
963     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
964     switch (emode) {
965     case CEED_EVAL_NONE:
966       break; // No action
967     case CEED_EVAL_INTERP:
968       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
969       data->B.out[i] = basis_data->d_interp1d;
970       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
971       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
972       break;
973     case CEED_EVAL_GRAD:
974       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
975       data->B.out[i] = basis_data->d_interp1d;
976       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
977       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
978       if (useCollograd) {
979         data->G.out[i] = basis_data->d_collograd1d;
980         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
981         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
982       } else {
983         data->G.out[i] = basis_data->d_grad1d;
984         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
985         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
986       }
987       break;
988     // LCOV_EXCL_START
989     case CEED_EVAL_WEIGHT: {
990       Ceed ceed;
991       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
992       return CeedError(ceed, 1,
993                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
994       break; // Should not occur
995     }
996     case CEED_EVAL_DIV:
997       break; // TODO: Not implemented
998     case CEED_EVAL_CURL:
999       break; // TODO: Not implemented
1000       // LCOV_EXCL_STOP
1001     }
1002   }
1003   code << "\n  // -- Element loop --\n";
1004   code << "  __syncthreads();\n";
1005   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1006   // Input basis apply if needed
1007   // Generate the correct eval mode code for each input
1008   code << "    // -- Input field restrictions and basis actions --\n";
1009   for (CeedInt i = 0; i < numinputfields; i++) {
1010     code << "    // ---- Input field "<<i<<" ----\n";
1011     // Get elemsize, emode, ncomp
1012     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1013     CeedChk(ierr);
1014     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1015     CeedChk(ierr);
1016     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1017     CeedChk(ierr);
1018     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1019     CeedChk(ierr);
1020 
1021     // Restriction
1022     if (emode != CEED_EVAL_WEIGHT &&
1023         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1024       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1025 
1026       bool isStrided;
1027       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1028       if (!isStrided) {
1029         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1030         CeedChk(ierr);
1031         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1032         CeedInt compstride;
1033         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1034         code << "    // CompStride: "<<compstride<<"\n";
1035         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1036         data->indices.in[i] = restr_data->d_ind;
1037         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1038       } else {
1039         bool backendstrides;
1040         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1041         CeedChk(ierr);
1042         CeedInt nelem;
1043         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1044         CeedChk(ierr);
1045         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1046         if (!backendstrides) {
1047           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1048           CeedChk(ierr);
1049         }
1050         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1051         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1052       }
1053     }
1054 
1055     // Basis action
1056     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1057     switch (emode) {
1058     case CEED_EVAL_NONE:
1059       if (!useCollograd) {
1060         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1061       }
1062       break;
1063     case CEED_EVAL_INTERP:
1064       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1065       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1066       break;
1067     case CEED_EVAL_GRAD:
1068       if (useCollograd) {
1069         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1070         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1071       } else {
1072         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1073         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1074       }
1075       break;
1076     case CEED_EVAL_WEIGHT:
1077       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1078       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1079       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1080       data->W = basis_data->d_qweight1d;
1081       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1082       break; // No action
1083     case CEED_EVAL_DIV:
1084       break; // TODO: Not implemented
1085     case CEED_EVAL_CURL:
1086       break; // TODO: Not implemented
1087     }
1088   }
1089 
1090   // Q function
1091   code << "\n    // -- Output field setup --\n";
1092   for (CeedInt i = 0; i < numoutputfields; i++) {
1093       code << "\n    // ---- Output field "<<i<<" ----\n";
1094     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1095     CeedChk(ierr);
1096     if (emode==CEED_EVAL_GRAD)
1097     {
1098       if (useCollograd) {
1099         //Accumulator for gradient slices
1100         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1101         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1102         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1103         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1104         code << "      }\n";
1105         code << "    }\n";
1106       } else {
1107         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1108       }
1109     }
1110     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1111     {
1112       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1113     }
1114   }
1115   // We treat quadrature points per slice in 3d to save registers
1116   if (useCollograd) {
1117     code << "\n    // Note: Collocated Gradient\n";
1118     code << "#pragma unroll\n";
1119     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1120     code << "      // -- Input fields --\n";
1121     for (CeedInt i = 0; i < numinputfields; i++) {
1122       code << "      // ---- Input field "<<i<<" ----\n";
1123       // Get elemsize, emode, ncomp
1124       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1125       CeedChk(ierr);
1126       // Basis action
1127       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1128       switch (emode) {
1129       case CEED_EVAL_NONE:
1130         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1131 
1132         bool isStrided;
1133         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1134         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1135         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1136         if (!isStrided) {
1137           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1138           CeedChk(ierr);
1139           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1140           CeedInt compstride;
1141           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1142           code << "      // CompStride: "<<compstride<<"\n";
1143           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1144           data->indices.in[i] = restr_data->d_ind;
1145           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1146         } else {
1147           bool backendstrides;
1148           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1149           CeedChk(ierr);
1150           CeedInt nelem;
1151           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1152           CeedChk(ierr);
1153           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1154           if (!backendstrides) {
1155             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1156             CeedChk(ierr);
1157           }
1158           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1159           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1160         }
1161         break;
1162       case CEED_EVAL_INTERP:
1163         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1164         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1165         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1166         code << "      }\n";
1167         break;
1168       case CEED_EVAL_GRAD:
1169         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1170         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1171         break;
1172       case CEED_EVAL_WEIGHT:
1173         code << "      CeedScalar r_q"<<i<<"[1];\n";
1174         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1175         break; // No action
1176       case CEED_EVAL_DIV:
1177         break; // TODO: Not implemented
1178       case CEED_EVAL_CURL:
1179         break; // TODO: Not implemented
1180       }
1181     }
1182     code << "\n      // -- Output fields --\n";
1183     for (CeedInt i = 0; i < numoutputfields; i++) {
1184       code << "      // ---- Output field "<<i<<" ----\n";
1185       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1186       CeedChk(ierr);
1187       // Basis action
1188       switch (emode) {
1189       case CEED_EVAL_NONE:
1190         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1191         break; // No action
1192       case CEED_EVAL_INTERP:
1193         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1194         break;
1195       case CEED_EVAL_GRAD:
1196         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1197         break;
1198       case CEED_EVAL_WEIGHT:
1199         break; // Should not occur
1200       case CEED_EVAL_DIV:
1201         break; // TODO: Not implemented
1202       case CEED_EVAL_CURL:
1203         break; // TODO: Not implemented
1204       }
1205     }
1206   } else {
1207     code << "\n      // Note: No Collocated Gradient\n";
1208     code << "      // -- Input fields --\n";
1209     for (CeedInt i = 0; i < numinputfields; i++) {
1210       code << "      // ---- Input field "<<i<<" ----\n";
1211       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1212     }
1213     code << "      // -- Output fields --\n";
1214     for (CeedInt i = 0; i < numoutputfields; i++) {
1215       code << "      // ---- Output field "<<i<<" ----\n";
1216       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1217     }
1218   }
1219   code << "\n      // -- QFunction Inputs and outputs --\n";
1220   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1221   for (CeedInt i = 0; i < numinputfields; i++) {
1222     code << "      // ---- Input field "<<i<<" ----\n";
1223     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1224   }
1225   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1226   for (CeedInt i = 0; i < numoutputfields; i++) {
1227     code << "      // ---- Output field "<<i<<" ----\n";
1228     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1229   }
1230   code << "\n      // -- Apply QFunction --\n";
1231   code << "      "<<qFunctionName<<"(ctx, ";
1232   if (dim != 3 || useCollograd) {
1233     code << "1";
1234   } else {
1235     code << "Q1d";
1236   }
1237   code << ", in, out);\n";
1238   if (useCollograd) {
1239     code << "\n      // Note: Collocated Gradient\n";
1240     code << "      // -- Output fields --\n";
1241     for (CeedInt i = 0; i < numoutputfields; i++) {
1242       code << "      // ---- Output field "<<i<<" ----\n";
1243       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1244       CeedChk(ierr);
1245       // Basis action
1246       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1247       switch (emode) {
1248       case CEED_EVAL_NONE:
1249         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1250         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1251         code << "      }\n";
1252         break; // No action
1253       case CEED_EVAL_INTERP:
1254         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1255         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1256         code << "      }\n";
1257         break;
1258       case CEED_EVAL_GRAD:
1259         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1260         break;
1261       case CEED_EVAL_WEIGHT:
1262         break; // Should not occur
1263       case CEED_EVAL_DIV:
1264         break; // TODO: Not implemented
1265       case CEED_EVAL_CURL:
1266         break; // TODO: Not implemented
1267       }
1268     }
1269     code << "    }\n";
1270   }
1271 
1272   // Output basis apply if needed
1273   // Generate the correct eval mode code for each output
1274   code << "\n    // -- Output field basis action and restrictions --\n";
1275   for (CeedInt i = 0; i < numoutputfields; i++) {
1276     code << "    // ---- Output field "<<i<<" ----\n";
1277     // Get elemsize, emode, ncomp
1278     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1279     CeedChk(ierr);
1280     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1281     CeedChk(ierr);
1282     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1283     CeedChk(ierr);
1284     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1285     CeedChk(ierr);
1286     // Basis action
1287     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1288     switch (emode) {
1289     case CEED_EVAL_NONE:
1290       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1291       break; // No action
1292     case CEED_EVAL_INTERP:
1293       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1294       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1295       break;
1296     case CEED_EVAL_GRAD:
1297       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1298       if (useCollograd) {
1299         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1300       } else {
1301         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1302       }
1303       break;
1304     // LCOV_EXCL_START
1305     case CEED_EVAL_WEIGHT: {
1306       Ceed ceed;
1307       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1308       return CeedError(ceed, 1,
1309                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1310       break; // Should not occur
1311     }
1312     case CEED_EVAL_DIV:
1313       break; // TODO: Not implemented
1314     case CEED_EVAL_CURL:
1315       break; // TODO: Not implemented
1316       // LCOV_EXCL_STOP
1317     }
1318     // Restriction
1319       bool isStrided;
1320       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1321     if (!isStrided) {
1322       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1323       CeedChk(ierr);
1324       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1325       CeedInt compstride;
1326       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1327       code << "    // CompStride: "<<compstride<<"\n";
1328       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1329       data->indices.out[i] = restr_data->d_ind;
1330       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1331     } else {
1332       bool backendstrides;
1333       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1334       CeedChk(ierr);
1335       CeedInt nelem;
1336       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1337       CeedChk(ierr);
1338       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1339       if (!backendstrides) {
1340         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1341         CeedChk(ierr);
1342       }
1343       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1344       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1345     }
1346   }
1347 
1348   code << "  }\n";
1349   code << "}\n";
1350   code << "// -----------------------------------------------------------------------------\n\n";
1351 
1352   // View kernel for debugging
1353   CeedDebug(code.str().c_str());
1354 
1355   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1356   CeedChk(ierr);
1357   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1358   CeedChk(ierr);
1359 
1360   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1361   return 0;
1362 }
1363 //------------------------------------------------------------------------------
1364