xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 7df9421200be70ca883fe45ddd0142905dd79820)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda-reg/ceed-cuda-reg.h"
21 #include "../cuda-shared/ceed-cuda-shared.h"
22 
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
// Software fallback for atomicAdd on double, for devices without native
// double-precision atomicAdd support. Implements the standard compare-and-swap
// loop; returns the value stored at *address before the addition.
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  // Reinterpret the double's bits as a 64-bit integer so atomicCAS can be used
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
42 
43 static const char *deviceFunctions = QUOTE(
44 
45 //------------------------------------------------------------------------------
46 // Typedefs
47 //------------------------------------------------------------------------------
// Containers for the generated operator kernel's field argument pointers:
// up to 16 input and 16 output arrays per operator.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread state threaded through all device helpers below.
typedef struct {
  CeedInt tidx;       // thread index, x dimension -- presumably threadIdx.x; set by generated kernel
  CeedInt tidy;       // thread index, y dimension
  CeedInt tidz;       // thread index, z dimension
  CeedInt tid;        // flat thread index within the block (used to stride block-wide loops)
  CeedScalar* slice;  // scratch slice shared across the block (see __syncthreads usage in contractions)
} BackendData;
58 
59 //------------------------------------------------------------------------------
60 // Load matrices for basis actions
61 //------------------------------------------------------------------------------
// Cooperatively copy the P*Q basis matrix d_B into B: every thread of the
// block handles one entry per stride of the full block size, starting at its
// flat index data.tid.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}
67 
68 //------------------------------------------------------------------------------
69 // 1D
70 //------------------------------------------------------------------------------
71 
72 //------------------------------------------------------------------------------
73 // L-vector -> E-vector, offsets provided
74 //------------------------------------------------------------------------------
// Gather one element's dofs from the L-vector d_u into registers r_u using
// the restriction indices: one thread per node, all NCOMP components, with
// components strided by COMPSTRIDE.  (nnodes is currently unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, strided
87 //------------------------------------------------------------------------------
// Gather one element's dofs from a strided L-vector d_u into registers r_u:
// index = node*STRIDES_NODE + comp*STRIDES_COMP + elem*STRIDES_ELEM.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
97 
98 //------------------------------------------------------------------------------
99 // E-vector -> L-vector, offsets provided
100 //------------------------------------------------------------------------------
// Scatter-add one element's register values r_v back to the L-vector d_v via
// the restriction indices.  atomicAdd is required because nodes shared by
// neighboring elements are written by multiple threads.  (nnodes unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
110 
111 //------------------------------------------------------------------------------
112 // E-vector -> L-vector, strided
113 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into a strided L-vector d_v.
// Plain += (no atomics): strided layouts give each element exclusive entries
// -- presumably guaranteed by the caller; verify against the restriction setup.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
123 
124 //------------------------------------------------------------------------------
125 // 1D tensor contraction x
126 //------------------------------------------------------------------------------
// 1D forward contraction: *V = sum_i B[i + tidx*P1d] * U(i), one value per
// thread, staging the U values through the block-shared slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // make all U values visible before reading the slice
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect the slice before the next contraction reuses it
}
136 
137 //------------------------------------------------------------------------------
138 // 1D transpose tensor contraction x
139 //------------------------------------------------------------------------------
// 1D transpose contraction: *V = sum_i B[tidx + i*P1d] * U(i); B is applied
// transposed relative to ContractX1d.  Threads with tidx >= P1d still compute
// a value, but it is discarded by the guarded dof writers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // make all U values visible before reading the slice
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i)
    *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect the slice before the next contraction reuses it
}
149 
150 //------------------------------------------------------------------------------
151 // 1D interpolate to quadrature points
152 //------------------------------------------------------------------------------
153 template <int NCOMP, int P1d, int Q1d>
154 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
155   for (CeedInt comp = 0; comp < NCOMP; comp++)
156     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
157 }
158 
159 //------------------------------------------------------------------------------
160 // 1D interpolate transpose
161 //------------------------------------------------------------------------------
162 template <int NCOMP, int P1d, int Q1d>
163 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
164   for (CeedInt comp=0; comp<NCOMP; comp++)
165     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
166 }
167 
168 //------------------------------------------------------------------------------
169 // 1D derivatives at quadrature points
170 //------------------------------------------------------------------------------
171 template <int NCOMP, int P1d, int Q1d>
172 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
173   for (CeedInt comp = 0; comp < NCOMP; comp++)
174     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
175 }
176 
177 //------------------------------------------------------------------------------
178 // 1D derivatives transpose
179 //------------------------------------------------------------------------------
180 template <int NCOMP, int P1d, int Q1d>
181 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
182   for (CeedInt comp = 0; comp < NCOMP; comp++)
183     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
184 }
185 
186 //------------------------------------------------------------------------------
187 // 2D
188 //------------------------------------------------------------------------------
189 
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, offsets provided
192 //------------------------------------------------------------------------------
// 2D gather: one thread per (tidx, tidy) node of a P1d x P1d element reads
// all NCOMP components from the L-vector through the restriction indices.
// (nnodes is currently unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
202 
203 //------------------------------------------------------------------------------
204 // L-vector -> E-vector, strided
205 //------------------------------------------------------------------------------
// 2D strided gather: index = node*STRIDES_NODE + comp*STRIDES_COMP +
// elem*STRIDES_ELEM, one thread per (tidx, tidy) node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
215 
216 //------------------------------------------------------------------------------
217 // E-vector -> L-vector, offsets provided
218 //------------------------------------------------------------------------------
// 2D scatter-add via restriction indices; atomicAdd because nodes shared
// between elements are written concurrently.  (nnodes unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
228 
229 //------------------------------------------------------------------------------
230 // E-vector -> L-vector, strided
231 //------------------------------------------------------------------------------
// 2D strided accumulate; plain += assumes each element owns distinct entries
// in the strided layout -- presumably guaranteed by the caller.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
241 
242 //------------------------------------------------------------------------------
243 // 2D tensor contraction x
244 //------------------------------------------------------------------------------
// 2D forward x contraction: each thread holds one value *U at (tidx, tidy);
// the slice (row stride Q1d) shares a full x line so each thread can compute
// *V = sum_i B[i + tidx*P1d] * U(i, tidy).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads(); // protect the slice before the next contraction reuses it
}
254 
255 //------------------------------------------------------------------------------
256 // 2D tensor contract y
257 //------------------------------------------------------------------------------
// 2D forward y contraction: same staging as ContractX2d but contracting along
// the y index: *V = sum_i B[i + tidy*P1d] * U(tidx, i).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads(); // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads(); // protect the slice before the next contraction reuses it
}
267 
268 //------------------------------------------------------------------------------
269 // 2D transpose tensor contract y
270 //------------------------------------------------------------------------------
// 2D transpose y contraction (Q1d -> P1d): only threads with tidy < P1d
// accumulate; all threads still zero *V and hit both barriers, so the
// __syncthreads calls stay outside the divergent branch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}
281 
282 //------------------------------------------------------------------------------
283 // 2D transpose tensor contract x
284 //------------------------------------------------------------------------------
// 2D transpose x contraction (Q1d -> P1d): only threads with tidx < P1d
// accumulate; barriers are reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}
295 
296 //------------------------------------------------------------------------------
297 // 2D transpose tensor contract and add x
298 //------------------------------------------------------------------------------
// Accumulating variant of ContractXTranspose2d: adds into *V instead of
// overwriting it (no zeroing), used to sum x- and y-derivative contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}
308 
309 //------------------------------------------------------------------------------
310 // 2D interpolate to quadrature points
311 //------------------------------------------------------------------------------
312 template <int NCOMP, int P1d, int Q1d>
313 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
314   CeedScalar r_t[1];
315   for (CeedInt comp = 0; comp < NCOMP; comp++) {
316     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
317     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
318   }
319 }
320 
321 //------------------------------------------------------------------------------
322 // 2D interpolate transpose
323 //------------------------------------------------------------------------------
324 template <int NCOMP, int P1d, int Q1d>
325 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
326   CeedScalar r_t[1];
327   for (CeedInt comp = 0; comp < NCOMP; comp++) {
328     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
329     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
330   }
331 }
332 
333 //------------------------------------------------------------------------------
334 // 2D derivatives at quadrature points
335 //------------------------------------------------------------------------------
336 template <int NCOMP, int P1d, int Q1d>
337 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
338   CeedScalar r_t[1];
339   for (CeedInt comp = 0; comp < NCOMP; comp++) {
340     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
341     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
342     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
343     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
344   }
345 }
346 
347 //------------------------------------------------------------------------------
348 // 2D derivatives transpose
349 //------------------------------------------------------------------------------
350 template <int NCOMP, int P1d, int Q1d>
351 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
352   CeedScalar r_t[1];
353   for (CeedInt comp = 0; comp < NCOMP; comp++) {
354     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
355     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
356     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
357     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
358   }
359 }
360 
361 //------------------------------------------------------------------------------
362 // 3D
363 //------------------------------------------------------------------------------
364 
365 //------------------------------------------------------------------------------
366 // L-vector -> E-vector, offsets provided
367 //------------------------------------------------------------------------------
// 3D gather: each (tidx, tidy) thread loops over the z planes of its element,
// reading all NCOMP components per node; register layout r_u[z + comp*P1d].
// (nnodes is currently unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}
378 
379 //------------------------------------------------------------------------------
380 // L-vector -> E-vector, strided
381 //------------------------------------------------------------------------------
// 3D strided gather into registers, layout r_u[z + comp*P1d];
// index = node*STRIDES_NODE + comp*STRIDES_COMP + elem*STRIDES_ELEM.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}
392 
393 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
395 //------------------------------------------------------------------------------
396 template <int NCOMP, int COMPSTRIDE, int Q1d>
397 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
398   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
399   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
400   for (CeedInt comp = 0; comp < NCOMP; ++comp)
401     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
402 }
403 
404 //------------------------------------------------------------------------------
405 // E-vector -> Q-vector, strided
406 //------------------------------------------------------------------------------
// Strided variant of readSliceQuadsOffset3d: gather one x-y quadrature slice
// at level q using compile-time node/component/element strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}
414 
415 //------------------------------------------------------------------------------
416 // E-vector -> L-vector, offsets provided
417 //------------------------------------------------------------------------------
// 3D scatter-add via restriction indices, register layout r_v[z + comp*P1d];
// atomicAdd because nodes shared between elements are written concurrently.
// (nnodes unused.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}
428 
429 //------------------------------------------------------------------------------
430 // E-vector -> L-vector, strided
431 //------------------------------------------------------------------------------
// 3D strided accumulate; plain += assumes each element owns distinct entries
// in the strided layout -- presumably guaranteed by the caller.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}
442 
443 //------------------------------------------------------------------------------
444 // 3D tensor contract x
445 //------------------------------------------------------------------------------
// 3D forward x contraction: U and V hold one z column per thread (P1d values).
// The thread's row of B is cached in registers, then each z plane is staged
// through the shared slice and contracted along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d]; // this thread's row of B, reused for every z plane
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads(); // plane fully staged before reads
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads(); // protect the slice before the next plane overwrites it
  }
}
461 
462 //------------------------------------------------------------------------------
463 // 3D tensor contract y
464 //------------------------------------------------------------------------------
// 3D forward y contraction, plane by plane; same staging scheme as
// ContractX3d but contracting along the y index of the slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d]; // this thread's row of B, reused for every z plane
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads(); // plane fully staged before reads
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads(); // protect the slice before the next plane overwrites it
  }
}
480 
481 //------------------------------------------------------------------------------
482 // 3D tensor contract z
483 //------------------------------------------------------------------------------
// 3D forward z contraction: the z column lives entirely in this thread's
// registers, so no shared memory or synchronization is needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
492 
493 //------------------------------------------------------------------------------
494 // 3D transpose tensor contract z
495 //------------------------------------------------------------------------------
// 3D transpose z contraction, register-only.  The k loop runs to Q1d (not
// P1d) so the full Q1d-sized output buffer is zeroed; only the first P1d
// entries receive contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
505 
506 //------------------------------------------------------------------------------
507 // 3D transpose tensor contract y
508 //------------------------------------------------------------------------------
// 3D transpose y contraction (Q1d -> P1d), plane by plane.  Only threads with
// tidy < P1d accumulate; all threads zero V[k] and reach both barriers, so
// __syncthreads stays outside the divergent branch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d]; // this thread's transposed row of B, reused per plane
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
525 
526 //------------------------------------------------------------------------------
527 // 3D transpose tensor contract add y
528 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeY3d: adds into V[k] instead of
// overwriting (no zeroing), used to sum derivative contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d]; // this thread's transposed row of B, reused per plane
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}
544 
545 //------------------------------------------------------------------------------
546 // 3D transpose tensor contract x
547 //------------------------------------------------------------------------------
// 3D transpose x contraction (Q1d -> P1d), plane by plane.  Only threads with
// tidx < P1d accumulate; barriers are reached by all threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d]; // this thread's transposed row of B, reused per plane
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
564 
565 //------------------------------------------------------------------------------
566 // 3D transpose tensor contract add x
567 //------------------------------------------------------------------------------
// Accumulating variant of ContractTransposeX3d: adds into V[k] instead of
// overwriting (no zeroing), used to sum derivative contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d]; // this thread's transposed row of B, reused per plane
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}
583 
584 //------------------------------------------------------------------------------
585 // 3D interpolate to quadrature points
586 //------------------------------------------------------------------------------
587 template <int NCOMP, int P1d, int Q1d>
588 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
589   CeedScalar r_t1[Q1d];
590   CeedScalar r_t2[Q1d];
591   for (CeedInt comp = 0; comp < NCOMP; comp++) {
592     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
593     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
594     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
595   }
596 }
597 
598 //------------------------------------------------------------------------------
599 // 3D interpolate transpose
600 //------------------------------------------------------------------------------
601 template <int NCOMP, int P1d, int Q1d>
602 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
603   CeedScalar r_t1[Q1d];
604   CeedScalar r_t2[Q1d];
605   for (CeedInt comp = 0; comp < NCOMP; comp++) {
606     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
607     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
608     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
609   }
610 }
611 
612 //------------------------------------------------------------------------------
613 // 3D derivatives at quadrature points
614 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Full (non-collocated) 3D gradient: for each component, three passes apply
  // the differentiation matrix c_G in one direction and the interpolation
  // matrix c_B in the other two. Output layout:
  // r_V[comp*Q1d + d*NCOMP*Q1d] for derivative direction d in {x=0, y=1, z=2}.
  CeedScalar r_t1[Q1d]; // scratch registers between directional contractions
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // Derivative in x: G in x, B in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // Derivative in y: G in y, B in x and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // Derivative in z: G in z, B in x and y
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
631 
632 //------------------------------------------------------------------------------
633 // 3D derivatives transpose
634 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad3d: gathers the x/y/z derivative blocks of r_U back to
  // nodal values in r_V. Pass order matters: the first X contraction writes
  // r_V, the two Add variants accumulate into it.
  CeedScalar r_t1[Q1d]; // scratch registers between directional contractions
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-derivative block (writes r_V)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-derivative block (accumulates into r_V)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-derivative block (accumulates into r_V)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
651 
652 //------------------------------------------------------------------------------
653 // 3D collocated derivatives computation
654 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated derivatives at z-slice q: r_U holds Q1d z-values per component
  // in registers; the X and Y contractions go through the shared-memory slice,
  // the Z contraction reads registers directly. c_G is the Q1d x Q1d
  // collocated differentiation matrix. Assumes one thread per (x, y)
  // quadrature point of the slice -- TODO confirm launch config at call sites.
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice-q values so threads can read neighbors
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads(); // slice fully written before any thread reads it
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative (register-only, no shared memory needed)
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads(); // reads done before next component overwrites slice
  }
}
675 
676 //------------------------------------------------------------------------------
677 // 3D collocated derivatives transpose
678 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of gradCollo3d: accumulates the contribution of slice q's
  // derivative values (r_U, indexed comp + d*NCOMP for d in {x=0, y=1, z=2})
  // into r_V, which holds Q1d z-values per component. r_V must be
  // zero-initialized by the caller before the first slice (the generated
  // kernel does this).
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads(); // slice fully written before cross-thread reads
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads(); // reads done before slice is overwritten below
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative: register-only, scatters slice q's value to all z levels
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
699 
700 //------------------------------------------------------------------------------
701 // 1D quadrature weights
702 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // One thread per quadrature point in x: copy this point's 1D weight
  *w = qweight1d[data.tidx];
}
707 
708 //------------------------------------------------------------------------------
709 // 2D quadrature weights
710 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product weight: 1D weight in x times 1D weight in y
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}
715 
716 //------------------------------------------------------------------------------
717 // 3D quadrature weights
718 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product weight; each thread owns a full column of Q1d z points.
  // The x-y contribution is common to every z level, so hoist it.
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = pw*qweight1d[z];
}
725 
726 );
727 //------------------------------------------------------------------------------
// Build single operator kernel
729 //------------------------------------------------------------------------------
730 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
731 
732   using std::ostringstream;
733   using std::string;
734   int ierr;
735   bool setupdone;
736   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
737   if (setupdone) return 0;
738   Ceed ceed;
739   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
740   CeedOperator_Cuda_gen *data;
741   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
742   CeedQFunction qf;
743   CeedQFunction_Cuda_gen *qf_data;
744   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
745   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
746   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
747           numoutputfields, ncomp, dim = 0, lsize;
748   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
749   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
750   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
751   CeedChk(ierr);
752   CeedOperatorField *opinputfields, *opoutputfields;
753   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
754   CeedChk(ierr);
755   CeedQFunctionField *qfinputfields, *qfoutputfields;
756   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
757   CeedChk(ierr);
758   CeedEvalMode emode;
759   CeedBasis basis;
760   CeedBasis_Cuda_shared *basis_data;
761   CeedElemRestriction Erestrict;
762   CeedElemRestriction_Cuda_reg *restr_data;
763 
764   ostringstream code;
765   string devFunctions(deviceFunctions);
766 
767   // Add atomicAdd function for old NVidia architectures
768   struct cudaDeviceProp prop;
769   Ceed_Cuda *ceed_data;
770   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
771   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
772   if (prop.major<6){
773     code << atomicAdd;
774   }
775 
776   code << devFunctions;
777 
778   string qFunction(qf_data->qFunctionSource);
779 
780   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
781   code << "\n#define CeedPragmaSIMD\n";
782 
783   // Find dim and Q1d
784   bool collograd = false;
785   for (CeedInt i = 0; i < numinputfields; i++) {
786     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
787     if (basis != CEED_BASIS_COLLOCATED) {
788       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
789 
790       // Check for collocated gradient
791       if (basis_data->d_collograd1d)
792         collograd = true;
793 
794       // Collect dim and Q1d
795       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
796       bool isTensor;
797       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
798       if (isTensor) {
799         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
800       } else {
801         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
802         }
803     }
804   }
805   // Check output bases for Q1d, dim as well
806   //   The only imput basis might be CEED_BASIS_COLLOCATED
807   for (CeedInt i = 0; i < numoutputfields; i++) {
808     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
809     if (basis != CEED_BASIS_COLLOCATED) {
810       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
811       // Collect dim and Q1d
812       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
813       bool isTensor;
814       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
815       if (isTensor) {
816         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
817       } else {
818         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
819         }
820     }
821   }
822   data->dim = dim;
823   data->Q1d = Q1d;
824 
825   // Define CEED_Q_VLA
826   if (dim != 3 || collograd) {
827     code << "\n#define CEED_Q_VLA 1\n\n";
828   } else {
829     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
830   }
831 
832   code << qFunction;
833 
834   // Setup
835   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
836   // Input Evecs and Restriction
837   for (CeedInt i = 0; i < numinputfields; i++) {
838     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
839     CeedChk(ierr);
840     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
841       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
842     }
843   }
844 
845   for (CeedInt i = 0; i < numoutputfields; i++) {
846     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
847   }
848 
849   code << "const CeedInt Dim = "<<dim<<";\n";
850   code << "const CeedInt Q1d = "<<Q1d<<";\n";
851 
852   code << "extern __shared__ CeedScalar slice[];\n";
853   code << "BackendData data;\n";
854   code << "data.tidx = threadIdx.x;\n";
855   code << "data.tidy = threadIdx.y;\n";
856   code << "data.tidz = threadIdx.z;\n";
857   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
858   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
859 
860   code << "\n// Input field constants and basis data\n";
861   //Initialize constants, and matrices B and G
862   for (CeedInt i = 0; i < numinputfields; i++) {
863     code << "// -- Input field "<<i<<" --\n";
864     // Get elemsize, emode, ncomp
865     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
866     CeedChk(ierr);
867     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
868     CeedChk(ierr);
869     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
870     CeedChk(ierr);
871     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
872     CeedChk(ierr);
873 
874     // Set field constants
875     if (emode != CEED_EVAL_WEIGHT) {
876       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
877       if (basis != CEED_BASIS_COLLOCATED) {
878         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
879         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
880       } else {
881         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
882       }
883       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
884     }
885 
886     // Load basis data
887     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
888     switch (emode) {
889     case CEED_EVAL_NONE:
890       break;
891     case CEED_EVAL_INTERP:
892       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
893       data->B.in[i] = basis_data->d_interp1d;
894       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
895       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
896       break;
897     case CEED_EVAL_GRAD:
898       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
899       data->B.in[i] = basis_data->d_interp1d;
900       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
901       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
902       if (basis_data->d_collograd1d) {
903         data->G.in[i] = basis_data->d_collograd1d;
904         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
905         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
906       } else {
907         data->G.in[i] = basis_data->d_grad1d;
908         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
909         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
910       }
911       break;
912     case CEED_EVAL_WEIGHT:
913       break; // No action
914     case CEED_EVAL_DIV:
915       break; // TODO: Not implemented
916     case CEED_EVAL_CURL:
917       break; // TODO: Not implemented
918     }
919   }
920 
921   code << "\n// Output field constants and basis data\n";
922   for (CeedInt i = 0; i < numoutputfields; i++) {
923     code << "// -- Output field "<<i<<" --\n";
924     // Get elemsize, emode, ncomp
925     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
926     CeedChk(ierr);
927     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
928     CeedChk(ierr);
929     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
930     CeedChk(ierr);
931     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
932     CeedChk(ierr);
933 
934     // Set field constants
935     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
936     if (basis != CEED_BASIS_COLLOCATED) {
937       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
938       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
939     } else {
940       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
941     }
942     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
943 
944     // Load basis data
945     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
946     switch (emode) {
947     case CEED_EVAL_NONE:
948       break; // No action
949     case CEED_EVAL_INTERP:
950       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
951       data->B.out[i] = basis_data->d_interp1d;
952       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
953       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
954       break;
955     case CEED_EVAL_GRAD:
956       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
957       data->B.out[i] = basis_data->d_interp1d;
958       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
959       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
960       if (basis_data->d_collograd1d) {
961         data->G.out[i] = basis_data->d_collograd1d;
962         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
963         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
964       } else {
965         data->G.out[i] = basis_data->d_grad1d;
966         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
967         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
968       }
969       break;
970     case CEED_EVAL_WEIGHT: {
971       Ceed ceed;
972       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
973       return CeedError(ceed, 1,
974                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
975       break; // Should not occur
976     }
977     case CEED_EVAL_DIV:
978       break; // TODO: Not implemented
979     case CEED_EVAL_CURL:
980       break; // TODO: Not implemented
981     }
982   }
983   code << "\n";
984   code << "__syncthreads();\n";
985   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
986   // Input basis apply if needed
987   // Generate the correct eval mode code for each input
988   code << "\n// Input field restrictions and basis actions\n";
989   for (CeedInt i = 0; i < numinputfields; i++) {
990     code << "  // -- Input field "<<i<<" --\n";
991     // Get elemsize, emode, ncomp
992     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
993     CeedChk(ierr);
994     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
995     CeedChk(ierr);
996     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
997     CeedChk(ierr);
998     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
999     CeedChk(ierr);
1000 
1001     // Restriction
1002     if (emode != CEED_EVAL_WEIGHT &&
1003         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1004       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1005 
1006       bool isStrided;
1007       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1008       if (!isStrided) {
1009         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1010         CeedChk(ierr);
1011         code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1012         CeedInt compstride;
1013         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1014         code << "  // CompStride: "<<compstride<<"\n";
1015         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1016         data->indices.in[i] = restr_data->d_ind;
1017         code << "  readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1018       } else {
1019         bool backendstrides;
1020         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1021         CeedChk(ierr);
1022         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1023         if (!backendstrides) {
1024           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1025           CeedChk(ierr);
1026         }
1027         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1028         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1029       }
1030     }
1031 
1032     // Basis action
1033     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1034     switch (emode) {
1035     case CEED_EVAL_NONE:
1036       if (!basis_data->d_collograd1d) {
1037         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1038       }
1039       break;
1040     case CEED_EVAL_INTERP:
1041       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1042       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1043       break;
1044     case CEED_EVAL_GRAD:
1045       if (basis_data->d_collograd1d) {
1046         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1047         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1048       } else {
1049         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1050         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1051       }
1052       break;
1053     case CEED_EVAL_WEIGHT:
1054       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1055       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1056       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1057       data->W = basis_data->d_qweight1d;
1058       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1059       break; // No action
1060     case CEED_EVAL_DIV:
1061       break; // TODO: Not implemented
1062     case CEED_EVAL_CURL:
1063       break; // TODO: Not implemented
1064     }
1065   }
1066 
1067   // Q function
1068   code << "\n// Output field setup\n";
1069   for (CeedInt i = 0; i < numoutputfields; i++) {
1070       code << "  // -- Output field "<<i<<" --\n";
1071     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1072     CeedChk(ierr);
1073     if (emode==CEED_EVAL_GRAD)
1074     {
1075       if (basis_data->d_collograd1d) {
1076         //Accumulator for gradient slices
1077         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1078         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1079         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1080         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1081         code << "    }\n";
1082         code << "  }\n";
1083       } else {
1084         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1085       }
1086     }
1087     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1088     {
1089       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1090     }
1091   }
1092   // We treat quadrature points per slice in 3d to save registers
1093   if (basis_data->d_collograd1d) {
1094     code << "\n  // Note: Collocated Gradient\n";
1095     code << "#pragma unroll\n";
1096     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1097     for (CeedInt i = 0; i < numinputfields; i++) {
1098       code << "  // -- Input field "<<i<<" --\n";
1099       // Get elemsize, emode, ncomp
1100       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1101       CeedChk(ierr);
1102       // Basis action
1103       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1104       switch (emode) {
1105       case CEED_EVAL_NONE:
1106         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1107 
1108         bool isStrided;
1109         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1110         if (!isStrided) {
1111           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1112           CeedChk(ierr);
1113           code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1114           CeedInt compstride;
1115           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1116           code << "  // CompStride: "<<compstride<<"\n";
1117           ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1118           data->indices.in[i] = restr_data->d_ind;
1119           code << "  readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1120         } else {
1121           bool backendstrides;
1122           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1123           CeedChk(ierr);
1124           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1125           if (!backendstrides) {
1126             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1127             CeedChk(ierr);
1128           }
1129           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1130           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1131         }
1132         break;
1133       case CEED_EVAL_INTERP:
1134         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1135         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1136         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1137         code << "  }\n";
1138         break;
1139       case CEED_EVAL_GRAD:
1140         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1141         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1142         break;
1143       case CEED_EVAL_WEIGHT:
1144         code << "  CeedScalar r_q"<<i<<"[1];\n";
1145         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1146         break; // No action
1147       case CEED_EVAL_DIV:
1148         break; // TODO: Not implemented
1149       case CEED_EVAL_CURL:
1150         break; // TODO: Not implemented
1151       }
1152     }
1153     for (CeedInt i = 0; i < numoutputfields; i++) {
1154       code << "  // -- Output field "<<i<<" --\n";
1155       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1156       CeedChk(ierr);
1157       // Basis action
1158       switch (emode) {
1159       case CEED_EVAL_NONE:
1160         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1161         break; // No action
1162       case CEED_EVAL_INTERP:
1163         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1164         break;
1165       case CEED_EVAL_GRAD:
1166         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1167         break;
1168       case CEED_EVAL_WEIGHT:
1169         break; // Should not occur
1170       case CEED_EVAL_DIV:
1171         break; // TODO: Not implemented
1172       case CEED_EVAL_CURL:
1173         break; // TODO: Not implemented
1174       }
1175     }
1176   } else {
1177       code << "\n  // Note: No Collocated Gradient\n";
1178     for (CeedInt i = 0; i < numinputfields; i++) {
1179       code << "  // -- Input field "<<i<<" --\n";
1180       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1181     }
1182     for (CeedInt i = 0; i < numoutputfields; i++) {
1183       code << "  // -- Output field "<<i<<" --\n";
1184       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1185     }
1186   }
1187   code << "  // QFunction Inputs and outputs\n";
1188   code << "  CeedScalar* in["<<numinputfields<<"];\n";
1189   for (CeedInt i = 0; i < numinputfields; i++) {
1190     code << "  // -- Input field "<<i<<" --\n";
1191     code << "  in["<<i<<"] = r_q"<<i<<";\n";
1192   }
1193   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
1194   for (CeedInt i = 0; i < numoutputfields; i++) {
1195     code << "  // -- Output field "<<i<<" --\n";
1196     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
1197   }
1198   code << "\n  // Apply QFunction\n";
1199   string qFunctionName(qf_data->qFunctionName);
1200   code << "  "<<qFunctionName<<"(ctx, ";
1201   if (dim != 3 || basis_data->d_collograd1d) {
1202     code << "1 ";
1203   } else {
1204     code << "Q1d ";
1205   }
1206   code << ", in, out);\n";
1207   if (basis_data->d_collograd1d) {
1208     code << "\n  // Note: Collocated Gradient\n";
1209     for (CeedInt i = 0; i < numoutputfields; i++) {
1210       code << "  // -- Output field "<<i<<" --\n";
1211       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1212       CeedChk(ierr);
1213       // Basis action
1214       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1215       switch (emode) {
1216       case CEED_EVAL_NONE:
1217         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1218         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1219         code << "  }\n";
1220         break; // No action
1221       case CEED_EVAL_INTERP:
1222         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1223         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1224         code << "  }\n";
1225         break;
1226       case CEED_EVAL_GRAD:
1227         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1228         break;
1229       case CEED_EVAL_WEIGHT:
1230         break; // Should not occur
1231       case CEED_EVAL_DIV:
1232         break; // TODO: Not implemented
1233       case CEED_EVAL_CURL:
1234         break; // TODO: Not implemented
1235       }
1236     }
1237     code << "}\n";
1238   }
1239 
1240   // Output basis apply if needed
1241   // Generate the correct eval mode code for each output
1242   code << "\n// Output field basis action and restrictions\n";
1243   for (CeedInt i = 0; i < numoutputfields; i++) {
1244     code << "  // -- Output field "<<i<<" --\n";
1245     // Get elemsize, emode, ncomp
1246     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1247     CeedChk(ierr);
1248     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1249     CeedChk(ierr);
1250     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1251     CeedChk(ierr);
1252     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1253     CeedChk(ierr);
1254     // Basis action
1255     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1256     switch (emode) {
1257     case CEED_EVAL_NONE:
1258       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1259       break; // No action
1260     case CEED_EVAL_INTERP:
1261       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1262       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1263       break;
1264     case CEED_EVAL_GRAD:
1265       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1266       if (basis_data->d_collograd1d) {
1267         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1268       } else {
1269         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1270       }
1271       break;
1272     case CEED_EVAL_WEIGHT: {
1273       Ceed ceed;
1274       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1275       return CeedError(ceed, 1,
1276                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1277       break; // Should not occur
1278     }
1279     case CEED_EVAL_DIV:
1280       break; // TODO: Not implemented
1281     case CEED_EVAL_CURL:
1282       break; // TODO: Not implemented
1283     }
1284     // Restriction
1285       bool isStrided;
1286       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1287     if (!isStrided) {
1288       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1289       CeedChk(ierr);
1290       code << "  const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1291       CeedInt compstride;
1292       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1293       code << "  // CompStride: "<<compstride<<"\n";
1294       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1295       data->indices.out[i] = restr_data->d_ind;
1296       code << "  writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1297     } else {
1298       bool backendstrides;
1299       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1300       CeedChk(ierr);
1301       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1302       if (!backendstrides) {
1303         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1304         CeedChk(ierr);
1305       }
1306       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1307       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1308     }
1309   }
1310 
1311   code << "  }\n";
1312   code << "}\n\n";
1313 
1314   // View kernel for debugging
1315   // std::cout << code.str();
1316 
1317   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1318   CeedChk(ierr);
1319   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1320   CeedChk(ierr);
1321 
1322   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1323   return 0;
1324 }
1325 //------------------------------------------------------------------------------
1326