xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c3c443fa4dd888aca94af004b304a7a213962033)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda-reg/ceed-cuda-reg.h"
21 #include "../cuda-shared/ceed-cuda-shared.h"
22 
// JIT source fragment providing atomicAdd(double*, double) as a CAS loop.
// Native double-precision atomicAdd requires SM60+; this is the standard
// emulation from the CUDA C Programming Guide. Presumably the host code
// prepends this string only when targeting older architectures -- the
// selection logic is not visible in this chunk.
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the double bits as 64-bit ints so atomicCAS can swap them.
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
42 
43 static const char *deviceFunctions = QUOTE(
44 
45 //------------------------------------------------------------------------------
46 // Typedefs
47 //------------------------------------------------------------------------------
// Kernel argument bundles: up to 16 input and 16 output field pointers.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
// Matching restriction index arrays (element dof offsets) per field.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread context threaded through every device helper below.
typedef struct {
  CeedInt tidx;       // presumably threadIdx.x -- set by the generated kernel
  CeedInt tidy;       // presumably threadIdx.y
  CeedInt tidz;       // presumably threadIdx.z
  CeedInt tid;        // flattened thread index within the block (see loadMatrix)
  CeedScalar* slice;  // shared-memory scratch used by the tensor contractions
} BackendData;
58 
59 //------------------------------------------------------------------------------
60 // Load matrices for basis actions
61 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  // Cooperative block-wide copy of the P*Q basis matrix from d_B into B:
  // every thread strides over the entries starting at its flat index.
  const CeedInt span = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += span)
    B[k] = d_B[k];
}
67 
68 //------------------------------------------------------------------------------
69 // 1D
70 //------------------------------------------------------------------------------
71 
72 //------------------------------------------------------------------------------
73 // L-vector -> E-vector, offsets provided
74 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // One thread per node: gather all NCOMP components of this element's
  // node from the L-vector through the indirection array.
  if (data.tidx < P1d) {
    const CeedInt idx = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[c] = d_u[idx + c*COMPSTRIDE];
  }
}
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, strided
87 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // One thread per node: gather all NCOMP components using the
  // compile-time node/component/element strides.
  if (data.tidx < P1d) {
    const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[c] = d_u[base + c*STRIDES_COMP];
  }
}
97 
98 //------------------------------------------------------------------------------
99 // E-vector -> L-vector, offsets provided
100 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add this thread's node values into the L-vector. atomicAdd is
  // used because distinct elements may map to the same L-vector entry.
  if (data.tidx < P1d) {
    const CeedInt idx = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; ++c)
      atomicAdd(&d_v[idx + c*COMPSTRIDE], r_v[c]);
  }
}
110 
111 //------------------------------------------------------------------------------
112 // E-vector -> L-vector, strided
113 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add using compile-time strides; unlike the offsets variant,
  // no atomics are used here.
  if (data.tidx < P1d) {
    const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c)
      d_v[base + c*STRIDES_COMP] += r_v[c];
  }
}
123 
124 //------------------------------------------------------------------------------
125 // 1D tensor contraction x
126 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // V = B * U along x. Stage each thread's value in the shared slice so
  // every thread can read all P1d inputs; B entry (q,p) is at B[p + q*P1d].
  // Assumes blockDim.x covers max(P1d, Q1d) and slice holds at least that
  // many entries -- TODO confirm against the generated kernel launch.
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect slice before the caller's next contraction
}
136 
137 //------------------------------------------------------------------------------
138 // 1D transpose tensor contraction x
139 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed contraction: V[p] = sum_q B(q,p) * U[q], with B entry (q,p)
  // stored at B[p + q*P1d].
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  // Guard added (matching the 2D transpose variants): threads with
  // tidx in [P1d, Q1d) would otherwise read B out of bounds, since
  // tidx + i*P1d can exceed P1d*Q1d - 1 whenever Q1d > P1d. Their V is
  // left at 0 and is unused for tidx >= P1d.
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
149 
150 //------------------------------------------------------------------------------
151 // 1D interpolate to quadrature points
152 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Apply the 1D interpolation matrix to each component independently.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
158 
159 //------------------------------------------------------------------------------
160 // 1D interpolate transpose
161 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Apply the transposed 1D interpolation matrix per component.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
167 
168 //------------------------------------------------------------------------------
169 // 1D derivatives at quadrature points
170 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // In 1D the derivative is a single contraction with the gradient matrix;
  // c_B is unused but kept so the signature matches the 2D/3D variants.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
176 
177 //------------------------------------------------------------------------------
178 // 1D derivatives transpose
179 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transposed 1D derivative: one transposed contraction with c_G;
  // c_B is unused but kept so the signature matches the 2D/3D variants.
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
185 
186 //------------------------------------------------------------------------------
187 // 2D
188 //------------------------------------------------------------------------------
189 
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, offsets provided
192 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // One thread per (x,y) node: gather all NCOMP components through the
  // indirection array.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt idx = indices[n + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[c] = d_u[idx + c*COMPSTRIDE];
  }
}
202 
203 //------------------------------------------------------------------------------
204 // L-vector -> E-vector, strided
205 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // One thread per (x,y) node: gather all NCOMP components using the
  // compile-time strides.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[c] = d_u[base + c*STRIDES_COMP];
  }
}
215 
216 //------------------------------------------------------------------------------
217 // E-vector -> L-vector, offsets provided
218 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add this thread's node values into the L-vector; atomicAdd is
  // used because distinct elements may index the same L-vector entry.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt idx = indices[n + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; ++c)
      atomicAdd(&d_v[idx + c*COMPSTRIDE], r_v[c]);
  }
}
228 
229 //------------------------------------------------------------------------------
230 // E-vector -> L-vector, strided
231 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add using compile-time strides; unlike the offsets variant,
  // no atomics are used here.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; ++c)
      d_v[base + c*STRIDES_COMP] += r_v[c];
  }
}
241 
242 //------------------------------------------------------------------------------
243 // 2D tensor contraction x
244 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // V = B * U along x. The shared slice is laid out Q1d wide; each thread
  // stages its value, then contracts its own x-row against its row of B.
  // Assumes the block covers Q1d x Q1d threads -- TODO confirm against the
  // generated kernel's launch configuration.
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads(); // protect slice before the caller's next contraction
}
254 
255 //------------------------------------------------------------------------------
256 // 2D tensor contract y
257 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // V = B * U along y: same staging pattern as ContractX2d, but each
  // thread walks a column of the slice (stride Q1d) with its tidy row of B.
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads(); // protect slice before the caller's next contraction
}
267 
268 //------------------------------------------------------------------------------
269 // 2D transpose tensor contract y
270 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed y contraction: V = B^T * U. Only threads on a valid dof row
  // (tidy < P1d) accumulate; the guard also keeps the B index
  // data.tidy + i*P1d within the P1d*Q1d entries of B.
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads(); // protect slice before the caller's next contraction
}
281 
282 //------------------------------------------------------------------------------
283 // 2D transpose tensor contract x
284 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed x contraction: V = B^T * U. Only threads on a valid dof
  // column (tidx < P1d) accumulate; the guard also keeps the B index
  // data.tidx + i*P1d within the P1d*Q1d entries of B.
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads(); // protect slice before the caller's next contraction
}
295 
296 //------------------------------------------------------------------------------
297 // 2D transpose tensor contract and add x
298 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Accumulating variant of ContractXTranspose2d: V += B^T * U. V is
  // deliberately NOT zeroed -- the caller sums several contributions
  // (see gradTranspose2d).
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads(); // protect slice before the caller's next contraction
}
308 
309 //------------------------------------------------------------------------------
310 // 2D interpolate to quadrature points
311 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract x, then y, per component.
  CeedScalar r_tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, r_tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_tmp, c_B, &r_V[c]);
  }
}
320 
321 //------------------------------------------------------------------------------
322 // 2D interpolate transpose
323 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transposed tensor-product interpolation: contract y, then x.
  CeedScalar r_tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, r_tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_tmp, c_B, &r_V[c]);
  }
}
332 
333 //------------------------------------------------------------------------------
334 // 2D derivatives at quadrature points
335 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Tensor-product gradient: d/dx uses G in x and B in y, d/dy uses B in x
  // and G in y. Outputs are blocked by direction: all NCOMP components of
  // d/dx first, then all components of d/dy.
  CeedScalar r_tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, r_tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_tmp, c_B, &r_V[c]);
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, r_tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_tmp, c_G, &r_V[c + NCOMP]);
  }
}
346 
347 //------------------------------------------------------------------------------
348 // 2D derivatives transpose
349 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad2d: the d/dx block goes through G^T in x, the d/dy
  // block through G^T in y; the second pass accumulates into the same
  // output component via the Add variant.
  CeedScalar r_tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, r_tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_tmp, c_G, &r_V[c]);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + NCOMP], c_G, r_tmp);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_tmp, c_B, &r_V[c]);
  }
}
360 
361 //------------------------------------------------------------------------------
362 // 3D
363 //------------------------------------------------------------------------------
364 
365 //------------------------------------------------------------------------------
366 // L-vector -> E-vector, offsets provided
367 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Each (tidx, tidy) thread walks its z-line of nodes, gathering every
  // component through the indirection array into registers, z-major per
  // component (r_u[z + c*P1d]).
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt idx = indices[n + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; ++c)
        r_u[z + c*P1d] = d_u[idx + c*COMPSTRIDE];
    }
}
378 
379 //------------------------------------------------------------------------------
380 // L-vector -> E-vector, strided
381 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Each (tidx, tidy) thread walks its z-line of nodes, gathering every
  // component using the compile-time strides, z-major per component.
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; ++c)
        r_u[z + c*P1d] = d_u[base + c*STRIDES_COMP];
    }
}
392 
393 //------------------------------------------------------------------------------
394 // E-vector -> Q-vector, offests provided
395 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather all components at one quadrature slice (fixed q) of the element
  // through the indirection array.
  const CeedInt n = data.tidx + Q1d*(data.tidy + Q1d*q);
  const CeedInt idx = indices[n + elem*Q1d*Q1d*Q1d];
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[idx + c*COMPSTRIDE];
}
403 
404 //------------------------------------------------------------------------------
405 // E-vector -> Q-vector, strided
406 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather all components at one quadrature slice (fixed q), strided layout.
  const CeedInt n = data.tidx + Q1d*(data.tidy + Q1d*q);
  const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[base + c*STRIDES_COMP];
}
414 
415 //------------------------------------------------------------------------------
416 // E-vector -> L-vector, offsets provided
417 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add each z-level of this thread's register values into the
  // L-vector; atomicAdd since distinct elements may share L-vector entries.
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt idx = indices[n + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; ++c)
        atomicAdd(&d_v[idx + c*COMPSTRIDE], r_v[z + c*P1d]);
    }
}
428 
429 //------------------------------------------------------------------------------
430 // E-vector -> L-vector, strided
431 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add each z-level using compile-time strides; unlike the
  // offsets variant, no atomics are used here.
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + P1d*(data.tidy + P1d*z);
      const CeedInt base = n*STRIDES_NODE + elem*STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; ++c)
        d_v[base + c*STRIDES_COMP] += r_v[z + c*P1d];
    }
}
442 
443 //------------------------------------------------------------------------------
444 // 3D tensor contract x
445 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract x for a full z-line held in registers (U[0..P1d)). Each
  // thread first caches its row of B (index i + tidx*P1d stays within
  // B's P1d*Q1d entries for tidx < Q1d), then the k-loop cycles one
  // z-level at a time through the shared slice.
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads(); // slice is reused for the next z-level
  }
}
461 
462 //------------------------------------------------------------------------------
463 // 3D tensor contract y
464 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract y for a full z-line held in registers; same staging pattern
  // as ContractX3d but each thread walks a column of the slice using its
  // tidy row of B.
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads(); // slice is reused for the next z-level
  }
}
480 
481 //------------------------------------------------------------------------------
482 // 3D tensor contract z
483 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Pure register-level contraction in z: each thread owns its whole
  // z-line, so no shared memory or synchronization is needed.
  for (CeedInt k = 0; k < Q1d; ++k) {
    CeedScalar acc = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      acc += B[i + k*P1d] * U[i]; // Contract z direction
    V[k] = acc;
  }
}
492 
493 //------------------------------------------------------------------------------
494 // 3D transpose tensor contract z
495 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed register-level z contraction. Entries V[P1d..Q1d) are
  // zeroed so callers may treat V as a full Q1d-length buffer.
  for (CeedInt k = 0; k < Q1d; ++k) {
    CeedScalar acc = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        acc += B[k + i*P1d] * U[i]; // Contract z direction
    V[k] = acc;
  }
}
505 
506 //------------------------------------------------------------------------------
507 // 3D transpose tensor contract y
508 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed y contraction for a z-line of register values.
  // The register cache of B is now guarded: for threads with tidy >= P1d
  // the index data.tidy + i*P1d runs past the P1d*Q1d entries of B
  // (out-of-bounds read whenever Q1d > P1d); those threads never use r_B
  // in the already-guarded accumulation below.
  CeedScalar r_B[Q1d];
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
525 
526 //------------------------------------------------------------------------------
527 // 3D transpose tensor contract add y
528 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Accumulating variant of ContractTransposeY3d (V[k] is NOT zeroed).
  // The register cache of B is now guarded: for threads with tidy >= P1d
  // the index data.tidy + i*P1d runs past the P1d*Q1d entries of B
  // (out-of-bounds read whenever Q1d > P1d); those threads never use r_B
  // in the already-guarded accumulation below.
  CeedScalar r_B[Q1d];
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
544 
545 //------------------------------------------------------------------------------
546 // 3D transpose tensor contract x
547 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transposed x contraction for a z-line of register values.
  // The register cache of B is now guarded: for threads with tidx >= P1d
  // the index data.tidx + i*P1d runs past the P1d*Q1d entries of B
  // (out-of-bounds read whenever Q1d > P1d); those threads never use r_B
  // in the already-guarded accumulation below.
  CeedScalar r_B[Q1d];
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
564 
565 //------------------------------------------------------------------------------
566 // 3D transpose tensor contract add x
567 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Accumulating variant of ContractTransposeX3d (V[k] is NOT zeroed).
  // The register cache of B is now guarded: for threads with tidx >= P1d
  // the index data.tidx + i*P1d runs past the P1d*Q1d entries of B
  // (out-of-bounds read whenever Q1d > P1d); those threads never use r_B
  // in the already-guarded accumulation below.
  CeedScalar r_B[Q1d];
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
583 
584 //------------------------------------------------------------------------------
585 // 3D interpolate to quadrature points
586 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: contract x, then y, then z, per component.
  CeedScalar buf1[Q1d];
  CeedScalar buf2[Q1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, buf1);
    ContractY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*Q1d]);
  }
}
597 
598 //------------------------------------------------------------------------------
599 // 3D interpolate transpose
600 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transposed tensor-product interpolation: contract z, then y, then x.
  CeedScalar buf1[Q1d];
  CeedScalar buf2[Q1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, buf1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*P1d]);
  }
}
611 
612 //------------------------------------------------------------------------------
613 // 3D derivatives at quadrature points
614 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Tensor-product gradient: for each derivative direction, apply G in
  // that direction and B in the other two. Output is blocked by direction:
  // the z-line for component comp of derivative dim starts at
  // r_V + comp*Q1d + dim*NCOMP*Q1d.
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx: G in x, B in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy: G in y
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz: G in z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
631 
632 //------------------------------------------------------------------------------
633 // 3D derivatives transpose
634 //------------------------------------------------------------------------------
// Transpose tensor-product gradient: fold the three derivative slabs of r_U
// back to nodal values.  The first slab initializes the output through the
// final X contraction; the remaining two accumulate into it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmpA[Q1d];
  CeedScalar tmpB[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    CeedScalar *v = r_V + comp*P1d;  // this component's nodal output
    // X-derivative slab: B^T in Z and Y, G^T in X (writes v)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, tmpA);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmpA, c_B, tmpB);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, tmpB, c_G, v);
    // Y-derivative slab: B^T in Z, G^T in Y, accumulate through X
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, tmpA);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmpA, c_G, tmpB);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, tmpB, c_B, v);
    // Z-derivative slab: G^T in Z, B^T in Y, accumulate through X
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, tmpA);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmpA, c_B, tmpB);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, tmpB, c_B, v);
  }
}
651 
652 //------------------------------------------------------------------------------
653 // 3D collocated derivatives computation
654 //------------------------------------------------------------------------------
// Collocated gradient at one quadrature slice q: computes the X, Y, and Z
// derivatives of each component into r_V[comp + d*NCOMP], d = 0,1,2.
//   r_U holds a full z-line (Q1d values) per component in registers; the
//   current slice is staged into data.slice so threads can read across x/y.
// Assumes the thread block covers (tidx, tidy) in [0,Q1d)^2 -- TODO confirm
// against the generated kernel's launch configuration.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice q into shared memory for cross-thread reads
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative -- uses only this thread's register-resident z-line, no shared memory
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads(); // all reads of slice done before the next component overwrites it
  }
}
675 
676 //------------------------------------------------------------------------------
677 // 3D collocated derivatives transpose
678 //------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulates quadrature slice q's X/Y/Z derivative
// values (r_U[comp + d*NCOMP], d = 0,1,2) into the z-line output r_V.
// Note: r_V is accumulated into (+=); the caller zeroes it beforehand (see the
// r_tt initialization emitted by the host code generator below).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative -- register-only scatter along this thread's z-line
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
699 
700 //------------------------------------------------------------------------------
701 // 1D quadrature weights
702 //------------------------------------------------------------------------------
// 1D quadrature weight: each thread in x owns one quadrature point.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  w[0] = qweight1d[data.tidx];
}
707 
708 //------------------------------------------------------------------------------
709 // 2D quadrature weights
710 //------------------------------------------------------------------------------
// 2D tensor-product quadrature weight: 1D weight in x times 1D weight in y.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar wx = qweight1d[data.tidx];
  const CeedScalar wy = qweight1d[data.tidy];
  *w = wx*wy;
}
715 
716 //------------------------------------------------------------------------------
717 // 3D quadrature weights
718 //------------------------------------------------------------------------------
// 3D tensor-product quadrature weight: each thread fills the full z-line of
// Q1d weights at its (tidx, tidy) location.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const CeedScalar wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = wxy*qweight1d[z];
}
725 
726 );
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
// Generate, JIT-compile, and cache one fused CUDA kernel for this CeedOperator.
// The emitted source string is: (optional atomicAdd shim) + device functions +
// the user's QFunction source + a kernel that, per element, performs
// restriction -> basis action -> QFunction -> transpose basis -> transpose
// restriction.  Returns immediately if setup was already done.
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {

  using std::ostringstream;
  using std::string;
  int ierr;
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  // Operator/QFunction sizes and per-field scratch variables
  CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, lsize;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields, *qfoutputfields;
  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
  CeedChk(ierr);
  CeedEvalMode emode;
  CeedBasis basis;
  CeedBasis_Cuda_shared *basis_data;
  CeedElemRestriction Erestrict;
  CeedElemRestriction_Cuda_reg *restr_data;

  ostringstream code;
  string devFunctions(deviceFunctions);

  // Add atomicAdd function for old NVidia architectures
  struct cudaDeviceProp prop;
  Ceed_Cuda *ceed_data;
  ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
  // NOTE(review): the cudaGetDeviceProperties status is stored in ierr but
  // never checked -- confirm whether a CeedChk_Cu-style check was intended.
  ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
  if (prop.major<6){
    // double-precision atomicAdd is only built in on SM60+; emit a CAS shim
    code << atomicAdd;
  }

  code << devFunctions;

  string qFunction(qf_data->qFunctionSource);
  string qFunctionName(qf_data->qFunctionName);
  string oper;
  oper = "kernel_" + qFunctionName;

  code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
  code << "\n#define CeedPragmaSIMD\n";

  // Find dim and Q1d
  bool collograd = false;
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);

      // Check for collocated gradient
      if (basis_data->d_collograd1d)
        collograd = true;

      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
      } else {
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
        }
    }
  }
  // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
  for (CeedInt i = 0; i < numoutputfields; i++) {
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      // Collect dim and Q1d
      ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
      bool isTensor;
      ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
      if (isTensor) {
        ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
      } else {
        return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
        }
    }
  }
  data->dim = dim;
  data->Q1d = Q1d;

  // Define CEED_Q_VLA
  // In the 3D non-collocated case the QFunction is applied to Q1d points at
  // once (one z-line); otherwise one point at a time.
  if (dim != 3 || collograd) {
    code << "\n#define CEED_Q_VLA 1\n\n";
  } else {
    code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
  }

  code << qFunction;

  // Setup
  code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
  // Input Evecs and Restriction
  for (CeedInt i = 0; i < numinputfields; i++) {
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
      code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
    }
  }

  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
  }

  code << "const CeedInt Dim = "<<dim<<";\n";
  code << "const CeedInt Q1d = "<<Q1d<<";\n";

  // Per-thread-block shared-memory slice used by the contraction helpers
  code << "extern __shared__ CeedScalar slice[];\n";
  code << "BackendData data;\n";
  code << "data.tidx = threadIdx.x;\n";
  code << "data.tidy = threadIdx.y;\n";
  code << "data.tidz = threadIdx.z;\n";
  code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";

  code << "\n// Input field constants and basis data\n";
  //Initialize constants, and matrices B and G
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "// -- Input field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    if (emode != CEED_EVAL_WEIGHT) {
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      if (basis != CEED_BASIS_COLLOCATED) {
        ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
        code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
      } else {
        code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
      }
      code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
    }

    // Load basis data
    code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.in[i] = basis_data->d_interp1d;
      code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
      if (basis_data->d_collograd1d) {
        // Collocated gradient: G is a Q1d x Q1d derivative-at-quadrature matrix
        data->G.in[i] = basis_data->d_collograd1d;
        code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      } else {
        data->G.in[i] = basis_data->d_grad1d;
        code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  code << "\n// Output field constants and basis data\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "// -- Output field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Set field constants
    ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
    if (basis != CEED_BASIS_COLLOCATED) {
      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
      code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
    } else {
      code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
    }
    code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";

    // Load basis data
    code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      break; // No action
    case CEED_EVAL_INTERP:
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->B.out[i] = basis_data->d_interp1d;
      code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
      code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
      if (basis_data->d_collograd1d) {
        data->G.out[i] = basis_data->d_collograd1d;
        code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
        code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      } else {
        data->G.out[i] = basis_data->d_grad1d;
        code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
        code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }
  code << "\n";
  // Barrier after the loadMatrix calls; elements are distributed over
  // (blockIdx.x, threadIdx.z) with a grid-stride step
  code << "__syncthreads();\n";
  code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
  // Input basis apply if needed
  // Generate the correct eval mode code for each input
  code << "\n// Input field restrictions and basis actions\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "  // -- Input field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);

    // Restriction
    // NOTE(review): basis_data here is whatever basis the setup loops above
    // visited last; if every field uses CEED_BASIS_COLLOCATED it is never
    // assigned before this read -- verify.
    if (emode != CEED_EVAL_WEIGHT &&
        !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
      code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";

      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
      if (!isStrided) {
        ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
        CeedChk(ierr);
        code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
        CeedInt compstride;
        ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
        code << "  // CompStride: "<<compstride<<"\n";
        ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
        data->indices.in[i] = restr_data->d_ind;
        code << "  readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
      } else {
        bool backendstrides;
        ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
        CeedChk(ierr);
        CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
        if (!backendstrides) {
          ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
          CeedChk(ierr);
        }
        code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
        code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
      }
    }

    // Basis action
    code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      if (!basis_data->d_collograd1d) {
        code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
      }
      break;
    case CEED_EVAL_INTERP:
      code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
      code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      if (basis_data->d_collograd1d) {
        // Collocated: interpolate now, differentiate later per z-slice
        code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
        code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
      } else {
        code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
        code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT:
      code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
      data->W = basis_data->d_qweight1d;
      code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
      break; // No action
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
  }

  // Q function
  code << "\n// Output field setup\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "  // -- Output field "<<i<<" --\n";
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    if (emode==CEED_EVAL_GRAD)
    {
      if (basis_data->d_collograd1d) {
        //Accumulator for gradient slices
        code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
        code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
        code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
        code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
        code << "    }\n";
        code << "  }\n";
      } else {
        code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
      }
    }
    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
    {
      code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
    }
  }
  // We treat quadrature points per slice in 3d to save registers
  if (basis_data->d_collograd1d) {
    code << "\n  // Note: Collocated Gradient\n";
    code << "#pragma unroll\n";
    code << "for (CeedInt q=0; q<Q1d; q++) {\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << "  // -- Input field "<<i<<" --\n";
      // Get elemsize, emode, ncomp
      ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";

        // NOTE(review): Erestrict still refers to the last field processed in
        // the previous loop, not to input field i -- verify this is intended.
        bool isStrided;
        ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
        if (!isStrided) {
          ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
          CeedChk(ierr);
          code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
          CeedInt compstride;
          ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
          code << "  // CompStride: "<<compstride<<"\n";
          ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
          data->indices.in[i] = restr_data->d_ind;
          code << "  readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
        } else {
          bool backendstrides;
          ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
          CeedChk(ierr);
          CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
          if (!backendstrides) {
            ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
            CeedChk(ierr);
          }
          code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
          code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
        }
        break;
      case CEED_EVAL_INTERP:
        code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
        code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
        code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
        code << "  }\n";
        break;
      case CEED_EVAL_GRAD:
        code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
        code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        code << "  CeedScalar r_q"<<i<<"[1];\n";
        code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
        break; // No action
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "  // -- Output field "<<i<<" --\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      switch (emode) {
      case CEED_EVAL_NONE:
        code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
        break;
      case CEED_EVAL_GRAD:
        code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
  } else {
      code << "\n  // Note: No Collocated Gradient\n";
    for (CeedInt i = 0; i < numinputfields; i++) {
      code << "  // -- Input field "<<i<<" --\n";
      code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
    }
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "  // -- Output field "<<i<<" --\n";
      code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
    }
  }
  code << "  // QFunction Inputs and outputs\n";
  code << "  CeedScalar* in["<<numinputfields<<"];\n";
  for (CeedInt i = 0; i < numinputfields; i++) {
    code << "  // -- Input field "<<i<<" --\n";
    code << "  in["<<i<<"] = r_q"<<i<<";\n";
  }
  code << "  CeedScalar* out["<<numoutputfields<<"];\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "  // -- Output field "<<i<<" --\n";
    code << "  out["<<i<<"] = r_qq"<<i<<";\n";
  }
  code << "\n  // Apply QFunction\n";
  code << "  "<<qFunctionName<<"(ctx, ";
  // Q argument passed to the QFunction: 1 point per call except the 3D
  // non-collocated case, which processes a full z-line of Q1d points
  if (dim != 3 || basis_data->d_collograd1d) {
    code << "1 ";
  } else {
    code << "Q1d ";
  }
  code << ", in, out);\n";
  if (basis_data->d_collograd1d) {
    code << "\n  // Note: Collocated Gradient\n";
    for (CeedInt i = 0; i < numoutputfields; i++) {
      code << "  // -- Output field "<<i<<" --\n";
      ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
      CeedChk(ierr);
      // Basis action
      code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
      switch (emode) {
      case CEED_EVAL_NONE:
        code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << "  }\n";
        break; // No action
      case CEED_EVAL_INTERP:
        code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
        code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
        code << "  }\n";
        break;
      case CEED_EVAL_GRAD:
        code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
        break;
      case CEED_EVAL_WEIGHT:
        break; // Should not occur
      case CEED_EVAL_DIV:
        break; // TODO: Not implemented
      case CEED_EVAL_CURL:
        break; // TODO: Not implemented
      }
    }
    // Close the per-quadrature-slice loop opened above
    code << "}\n";
  }

  // Output basis apply if needed
  // Generate the correct eval mode code for each output
  code << "\n// Output field basis action and restrictions\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "  // -- Output field "<<i<<" --\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChk(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChk(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChk(ierr);
    // Basis action
    code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (basis_data->d_collograd1d) {
        // Collocated: derivatives were already folded in per slice above
        code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    case CEED_EVAL_WEIGHT: {
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
      return CeedError(ceed, 1,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
    }
    // Restriction
      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
    if (!isStrided) {
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChk(ierr);
      code << "  const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
      code << "  // CompStride: "<<compstride<<"\n";
      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << "  writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChk(ierr);
      CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChk(ierr);
      }
      code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }

  // Close the element loop and the kernel body
  code << "  }\n";
  code << "}\n\n";

  // View kernel for debugging
  // std::cout << code.str();

  // JIT-compile the assembled source and fetch the kernel handle
  ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
  CeedChk(ierr);
  ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
  CeedChk(ierr);

  ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
  return 0;
}
1327 //------------------------------------------------------------------------------
1328