xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision fc6bbced940bdfff573ee2079e5cb2caff7bebe9)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-cuda-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
24 static const char *atomicAdd = QUOTE(
25 //------------------------------------------------------------------------------
26 // Atomic add, for older CUDA
27 //------------------------------------------------------------------------------
28 __device__ double atomicAdd(double *address, double val) {
29   unsigned long long int *address_as_ull = (unsigned long long int *)address;
30   unsigned long long int old = *address_as_ull, assumed;
31   do {
32     assumed = old;
33     old =
34       atomicCAS(address_as_ull, assumed,
35                 __double_as_longlong(val +
36                                      __longlong_as_double(assumed)));
37     // Note: uses integer comparison to avoid hang in case of NaN
38     // (since NaN != NaN)
39   } while (assumed != old);
40   return __longlong_as_double(old);
41 }
42 );
43 
44 static const char *deviceFunctions = QUOTE(
45 
46 //------------------------------------------------------------------------------
47 // Typedefs
48 //------------------------------------------------------------------------------
49 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
50 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
51 
52 typedef struct {
53   CeedInt tidx;
54   CeedInt tidy;
55   CeedInt tidz;
56   CeedInt tid;
57   CeedScalar* slice;
58 } BackendData;
59 
60 //------------------------------------------------------------------------------
61 // Load matrices for basis actions
62 //------------------------------------------------------------------------------
63 template <int P, int Q>
64 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
65   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
66     B[i] = d_B[i];
67 }
68 
69 //------------------------------------------------------------------------------
70 // 1D
71 //------------------------------------------------------------------------------
72 
73 //------------------------------------------------------------------------------
74 // L-vector -> E-vector, offsets provided
75 //------------------------------------------------------------------------------
76 template <int NCOMP, int COMPSTRIDE, int P1d>
77 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
78   if (data.tidx < P1d) {
79     const CeedInt node = data.tidx;
80     const CeedInt ind = indices[node + elem * P1d];
81     for (CeedInt comp = 0; comp < NCOMP; ++comp)
82       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
83   }
84 }
85 
86 //------------------------------------------------------------------------------
87 // L-vector -> E-vector, strided
88 //------------------------------------------------------------------------------
89 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
90 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
91   if (data.tidx < P1d) {
92     const CeedInt node = data.tidx;
93     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
94     for (CeedInt comp = 0; comp < NCOMP; ++comp)
95       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
96   }
97 }
98 
99 //------------------------------------------------------------------------------
100 // E-vector -> L-vector, offsets provided
101 //------------------------------------------------------------------------------
102 template <int NCOMP, int COMPSTRIDE, int P1d>
103 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
104   if (data.tidx < P1d) {
105     const CeedInt node = data.tidx;
106     const CeedInt ind = indices[node + elem * P1d];
107     for (CeedInt comp = 0; comp < NCOMP; ++comp)
108       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
109   }
110 }
111 
112 //------------------------------------------------------------------------------
113 // E-vector -> L-vector, strided
114 //------------------------------------------------------------------------------
115 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
116 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
117   if (data.tidx < P1d) {
118     const CeedInt node = data.tidx;
119     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
120     for (CeedInt comp = 0; comp < NCOMP; ++comp)
121       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
122   }
123 }
124 
125 //------------------------------------------------------------------------------
126 // 1D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NCOMP, int P1d, int Q1d>
129 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130   data.slice[data.tidx] = *U;
131   __syncthreads();
132   *V = 0.0;
133   for (CeedInt i = 0; i < P1d; ++i)
134     *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
135   __syncthreads();
136 }
137 
138 //------------------------------------------------------------------------------
139 // 1D transpose tensor contraction x
140 //------------------------------------------------------------------------------
141 template <int NCOMP, int P1d, int Q1d>
142 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
143   data.slice[data.tidx] = *U;
144   __syncthreads();
145   *V = 0.0;
146   for (CeedInt i = 0; i < Q1d; ++i)
147     *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
148   __syncthreads();
149 }
150 
151 //------------------------------------------------------------------------------
152 // 1D interpolate to quadrature points
153 //------------------------------------------------------------------------------
154 template <int NCOMP, int P1d, int Q1d>
155 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
156   for (CeedInt comp = 0; comp < NCOMP; comp++)
157     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
158 }
159 
160 //------------------------------------------------------------------------------
161 // 1D interpolate transpose
162 //------------------------------------------------------------------------------
163 template <int NCOMP, int P1d, int Q1d>
164 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
165   for (CeedInt comp=0; comp<NCOMP; comp++)
166     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
167 }
168 
169 //------------------------------------------------------------------------------
170 // 1D derivatives at quadrature points
171 //------------------------------------------------------------------------------
172 template <int NCOMP, int P1d, int Q1d>
173 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
174   for (CeedInt comp = 0; comp < NCOMP; comp++)
175     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
176 }
177 
178 //------------------------------------------------------------------------------
179 // 1D derivatives transpose
180 //------------------------------------------------------------------------------
181 template <int NCOMP, int P1d, int Q1d>
182 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
183   for (CeedInt comp = 0; comp < NCOMP; comp++)
184     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
185 }
186 
187 //------------------------------------------------------------------------------
188 // 2D
189 //------------------------------------------------------------------------------
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, offsets provided
193 //------------------------------------------------------------------------------
194 template <int NCOMP, int COMPSTRIDE, int P1d>
195 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
196   if (data.tidx < P1d && data.tidy < P1d) {
197     const CeedInt node = data.tidx + data.tidy*P1d;
198     const CeedInt ind = indices[node + elem * P1d*P1d];
199     for (CeedInt comp = 0; comp < NCOMP; ++comp)
200       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // L-vector -> E-vector, strided
206 //------------------------------------------------------------------------------
207 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
208 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
209   if (data.tidx < P1d && data.tidy < P1d) {
210     const CeedInt node = data.tidx + data.tidy*P1d;
211     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
212     for (CeedInt comp = 0; comp < NCOMP; ++comp)
213       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
214   }
215 }
216 
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, offsets provided
219 //------------------------------------------------------------------------------
220 template <int NCOMP, int COMPSTRIDE, int P1d>
221 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
222   if (data.tidx < P1d && data.tidy < P1d) {
223     const CeedInt node = data.tidx + data.tidy*P1d;
224     const CeedInt ind = indices[node + elem * P1d*P1d];
225     for (CeedInt comp = 0; comp < NCOMP; ++comp)
226       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
227   }
228 }
229 
230 //------------------------------------------------------------------------------
231 // E-vector -> L-vector, strided
232 //------------------------------------------------------------------------------
233 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
234 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
235   if (data.tidx < P1d && data.tidy < P1d) {
236     const CeedInt node = data.tidx + data.tidy*P1d;
237     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
238     for (CeedInt comp = 0; comp < NCOMP; ++comp)
239       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
240   }
241 }
242 
243 //------------------------------------------------------------------------------
244 // 2D tensor contraction x
245 //------------------------------------------------------------------------------
246 template <int NCOMP, int P1d, int Q1d>
247 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
248   data.slice[data.tidx+data.tidy*Q1d] = *U;
249   __syncthreads();
250   *V = 0.0;
251   for (CeedInt i = 0; i < P1d; ++i)
252     *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
253   __syncthreads();
254 }
255 
256 //------------------------------------------------------------------------------
257 // 2D tensor contract y
258 //------------------------------------------------------------------------------
259 template <int NCOMP, int P1d, int Q1d>
260 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
261   data.slice[data.tidx+data.tidy*Q1d] = *U;
262   __syncthreads();
263   *V = 0.0;
264   for (CeedInt i = 0; i < P1d; ++i)
265     *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
266   __syncthreads();
267 }
268 
269 //------------------------------------------------------------------------------
270 // 2D transpose tensor contract y
271 //------------------------------------------------------------------------------
272 template <int NCOMP, int P1d, int Q1d>
273 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
274   data.slice[data.tidx+data.tidy*Q1d] = *U;
275   __syncthreads();
276   *V = 0.0;
277   if (data.tidy < P1d)
278     for (CeedInt i = 0; i < Q1d; ++i)
279       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
280   __syncthreads();
281 }
282 
283 //------------------------------------------------------------------------------
284 // 2D transpose tensor contract x
285 //------------------------------------------------------------------------------
286 template <int NCOMP, int P1d, int Q1d>
287 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
288   data.slice[data.tidx+data.tidy*Q1d] = *U;
289   __syncthreads();
290   *V = 0.0;
291   if (data.tidx < P1d)
292     for (CeedInt i = 0; i < Q1d; ++i)
293       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
294   __syncthreads();
295 }
296 
297 //------------------------------------------------------------------------------
298 // 2D transpose tensor contract and add x
299 //------------------------------------------------------------------------------
300 template <int NCOMP, int P1d, int Q1d>
301 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
302   data.slice[data.tidx+data.tidy*Q1d] = *U;
303   __syncthreads();
304   if (data.tidx < P1d)
305     for (CeedInt i = 0; i < Q1d; ++i)
306       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
307   __syncthreads();
308 }
309 
310 //------------------------------------------------------------------------------
311 // 2D interpolate to quadrature points
312 //------------------------------------------------------------------------------
313 template <int NCOMP, int P1d, int Q1d>
314 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
315   CeedScalar r_t[1];
316   for (CeedInt comp = 0; comp < NCOMP; comp++) {
317     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
318     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
319   }
320 }
321 
322 //------------------------------------------------------------------------------
323 // 2D interpolate transpose
324 //------------------------------------------------------------------------------
325 template <int NCOMP, int P1d, int Q1d>
326 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
327   CeedScalar r_t[1];
328   for (CeedInt comp = 0; comp < NCOMP; comp++) {
329     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
330     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
331   }
332 }
333 
334 //------------------------------------------------------------------------------
335 // 2D derivatives at quadrature points
336 //------------------------------------------------------------------------------
337 template <int NCOMP, int P1d, int Q1d>
338 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
339   CeedScalar r_t[1];
340   for (CeedInt comp = 0; comp < NCOMP; comp++) {
341     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
342     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
343     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
344     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
345   }
346 }
347 
348 //------------------------------------------------------------------------------
349 // 2D derivatives transpose
350 //------------------------------------------------------------------------------
351 template <int NCOMP, int P1d, int Q1d>
352 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
353   CeedScalar r_t[1];
354   for (CeedInt comp = 0; comp < NCOMP; comp++) {
355     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
356     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
357     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
358     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
359   }
360 }
361 
362 //------------------------------------------------------------------------------
363 // 3D
364 //------------------------------------------------------------------------------
365 
366 //------------------------------------------------------------------------------
367 // L-vector -> E-vector, offsets provided
368 //------------------------------------------------------------------------------
369 template <int NCOMP, int COMPSTRIDE, int P1d>
370 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
371   if (data.tidx < P1d && data.tidy < P1d)
372     for (CeedInt z = 0; z < P1d; ++z) {
373       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
374       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
375       for (CeedInt comp = 0; comp < NCOMP; ++comp)
376         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
377     }
378 }
379 
380 //------------------------------------------------------------------------------
381 // L-vector -> E-vector, strided
382 //------------------------------------------------------------------------------
383 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
384 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
385   if (data.tidx < P1d && data.tidy < P1d)
386     for (CeedInt z = 0; z < P1d; ++z) {
387       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
388       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
389       for (CeedInt comp = 0; comp < NCOMP; ++comp)
390         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
391     }
392 }
393 
394 //------------------------------------------------------------------------------
395 // E-vector -> Q-vector, offests provided
396 //------------------------------------------------------------------------------
397 template <int NCOMP, int COMPSTRIDE, int Q1d>
398 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
399   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
400   const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
401   for (CeedInt comp = 0; comp < NCOMP; ++comp)
402     r_u[comp] = d_u[ind + COMPSTRIDE * comp];
403 }
404 
405 //------------------------------------------------------------------------------
406 // E-vector -> Q-vector, strided
407 //------------------------------------------------------------------------------
408 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
409 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
410   const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
411   const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
412   for (CeedInt comp = 0; comp < NCOMP; ++comp)
413     r_u[comp] = d_u[ind + comp * STRIDES_COMP];
414 }
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> L-vector, offsets provided
418 //------------------------------------------------------------------------------
419 template <int NCOMP, int COMPSTRIDE, int P1d>
420 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
421   if (data.tidx < P1d && data.tidy < P1d)
422     for (CeedInt z = 0; z < P1d; ++z) {
423       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
424       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
425       for (CeedInt comp = 0; comp < NCOMP; ++comp)
426         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
427     }
428 }
429 
430 //------------------------------------------------------------------------------
431 // E-vector -> L-vector, strided
432 //------------------------------------------------------------------------------
433 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
434 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
435   if (data.tidx < P1d && data.tidy < P1d)
436     for (CeedInt z = 0; z < P1d; ++z) {
437       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
438       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
439       for (CeedInt comp = 0; comp < NCOMP; ++comp)
440         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
441     }
442 }
443 
444 //------------------------------------------------------------------------------
445 // 3D tensor contract x
446 //------------------------------------------------------------------------------
447 template <int NCOMP, int P1d, int Q1d>
448 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
449   CeedScalar r_B[P1d];
450   for (CeedInt i = 0; i < P1d; ++i)
451     r_B[i] = B[i + data.tidx*P1d];
452 
453   for (CeedInt k = 0; k < P1d; ++k) {
454     data.slice[data.tidx+data.tidy*Q1d] = U[k];
455     __syncthreads();
456     V[k] = 0.0;
457     for (CeedInt i = 0; i < P1d; ++i)
458       V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
459     __syncthreads();
460   }
461 }
462 
463 //------------------------------------------------------------------------------
464 // 3D tensor contract y
465 //------------------------------------------------------------------------------
466 template <int NCOMP, int P1d, int Q1d>
467 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
468   CeedScalar r_B[P1d];
469   for (CeedInt i = 0; i < P1d; ++i)
470     r_B[i] = B[i + data.tidy*P1d];
471 
472   for (CeedInt k = 0; k < P1d; ++k) {
473     data.slice[data.tidx+data.tidy*Q1d] = U[k];
474     __syncthreads();
475     V[k] = 0.0;
476     for (CeedInt i = 0; i < P1d; ++i)
477       V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
478     __syncthreads();
479   }
480 }
481 
482 //------------------------------------------------------------------------------
483 // 3D tensor contract z
484 //------------------------------------------------------------------------------
485 template <int NCOMP, int P1d, int Q1d>
486 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
487   for (CeedInt k = 0; k < Q1d; ++k) {
488     V[k] = 0.0;
489     for (CeedInt i = 0; i < P1d; ++i)
490       V[k] += B[i + k*P1d] * U[i]; // Contract z direction
491   }
492 }
493 
494 //------------------------------------------------------------------------------
495 // 3D transpose tensor contract z
496 //------------------------------------------------------------------------------
497 template <int NCOMP, int P1d, int Q1d>
498 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
499   for (CeedInt k = 0; k < Q1d; ++k) {
500     V[k] = 0.0;
501     if (k < P1d)
502       for (CeedInt i = 0; i < Q1d; ++i)
503         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
504   }
505 }
506 
507 //------------------------------------------------------------------------------
508 // 3D transpose tensor contract y
509 //------------------------------------------------------------------------------
510 template <int NCOMP, int P1d, int Q1d>
511 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
512   CeedScalar r_B[Q1d];
513   for (CeedInt i = 0; i < Q1d; ++i)
514     r_B[i] = B[data.tidy + i*P1d];
515 
516   for (CeedInt k = 0; k < P1d; ++k) {
517     data.slice[data.tidx+data.tidy*Q1d] = U[k];
518     __syncthreads();
519     V[k] = 0.0;
520     if (data.tidy < P1d)
521       for (CeedInt i = 0; i < Q1d; ++i)
522         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
523     __syncthreads();
524   }
525 }
526 
527 //------------------------------------------------------------------------------
528 // 3D transpose tensor contract add y
529 //------------------------------------------------------------------------------
530 template <int NCOMP, int P1d, int Q1d>
531 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
532   CeedScalar r_B[Q1d];
533   for (CeedInt i = 0; i < Q1d; ++i)
534     r_B[i] = B[data.tidy + i*P1d];
535 
536   for (CeedInt k = 0; k < P1d; ++k) {
537     data.slice[data.tidx+data.tidy*Q1d] = U[k];
538     __syncthreads();
539     if (data.tidy < P1d)
540       for (CeedInt i = 0; i < Q1d; ++i)
541         V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
542     __syncthreads();
543   }
544 }
545 
546 //------------------------------------------------------------------------------
547 // 3D transpose tensor contract x
548 //------------------------------------------------------------------------------
549 template <int NCOMP, int P1d, int Q1d>
550 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
551   CeedScalar r_B[Q1d];
552   for (CeedInt i = 0; i < Q1d; ++i)
553     r_B[i] = B[data.tidx + i*P1d];
554 
555   for (CeedInt k = 0; k < P1d; ++k) {
556     data.slice[data.tidx+data.tidy*Q1d] = U[k];
557     __syncthreads();
558     V[k] = 0.0;
559     if (data.tidx < P1d)
560       for (CeedInt i = 0; i < Q1d; ++i)
561         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
562     __syncthreads();
563   }
564 }
565 
566 //------------------------------------------------------------------------------
567 // 3D transpose tensor contract add x
568 //------------------------------------------------------------------------------
569 template <int NCOMP, int P1d, int Q1d>
570 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
571   CeedScalar r_B[Q1d];
572   for (CeedInt i = 0; i < Q1d; ++i)
573     r_B[i] = B[data.tidx + i*P1d];
574 
575   for (CeedInt k = 0; k < P1d; ++k) {
576     data.slice[data.tidx+data.tidy*Q1d] = U[k];
577     __syncthreads();
578     if (data.tidx < P1d)
579       for (CeedInt i = 0; i < Q1d; ++i)
580         V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
581     __syncthreads();
582   }
583 }
584 
585 //------------------------------------------------------------------------------
586 // 3D interpolate to quadrature points
587 //------------------------------------------------------------------------------
588 template <int NCOMP, int P1d, int Q1d>
589 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
590   CeedScalar r_t1[Q1d];
591   CeedScalar r_t2[Q1d];
592   for (CeedInt comp = 0; comp < NCOMP; comp++) {
593     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
594     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
595     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
596   }
597 }
598 
599 //------------------------------------------------------------------------------
600 // 3D interpolate transpose
601 //------------------------------------------------------------------------------
602 template <int NCOMP, int P1d, int Q1d>
603 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
604   CeedScalar r_t1[Q1d];
605   CeedScalar r_t2[Q1d];
606   for (CeedInt comp = 0; comp < NCOMP; comp++) {
607     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
608     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
609     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
610   }
611 }
612 
613 //------------------------------------------------------------------------------
614 // 3D derivatives at quadrature points
615 //------------------------------------------------------------------------------
616 template <int NCOMP, int P1d, int Q1d>
617 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
618   CeedScalar r_t1[Q1d];
619   CeedScalar r_t2[Q1d];
620   for (CeedInt comp = 0; comp < NCOMP; comp++) {
621     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
622     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
623     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
624     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
625     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
626     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
627     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
628     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
629     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
630   }
631 }
632 
633 //------------------------------------------------------------------------------
634 // 3D derivatives transpose
635 //------------------------------------------------------------------------------
636 template <int NCOMP, int P1d, int Q1d>
637 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
638   CeedScalar r_t1[Q1d];
639   CeedScalar r_t2[Q1d];
640   for (CeedInt comp = 0; comp < NCOMP; comp++) {
641     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
642     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
643     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
644     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
645     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
646     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
647     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
648     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
649     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
650   }
651 }
652 
653 //------------------------------------------------------------------------------
654 // 3D collocated derivatives computation
655 //------------------------------------------------------------------------------
656 template <int NCOMP, int Q1d>
657 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
658   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
659     data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
660     __syncthreads();
661     // X derivative
662     r_V[comp+0*NCOMP] = 0.0;
663     for (CeedInt i = 0; i < Q1d; ++i)
664       r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
665     // Y derivative
666     r_V[comp+1*NCOMP] = 0.0;
667     for (CeedInt i = 0; i < Q1d; ++i)
668       r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
669     // Z derivative
670     r_V[comp+2*NCOMP] = 0.0;
671     for (CeedInt i = 0; i < Q1d; ++i)
672       r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
673     __syncthreads();
674   }
675 }
676 
677 //------------------------------------------------------------------------------
678 // 3D collocated derivatives transpose
679 //------------------------------------------------------------------------------
680 template <int NCOMP, int Q1d>
681 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
682   for (CeedInt comp = 0; comp < NCOMP; ++comp) {
683     // X derivative
684     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
685     __syncthreads();
686     for (CeedInt i = 0; i < Q1d; ++i)
687       r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
688     __syncthreads();
689     // Y derivative
690     data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
691     __syncthreads();
692     for (CeedInt i = 0; i < Q1d; ++i)
693       r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
694     __syncthreads();
695     // Z derivative
696     for (CeedInt i = 0; i < Q1d; ++i)
697       r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
698   }
699 }
700 
701 //------------------------------------------------------------------------------
702 // 1D quadrature weights
703 //------------------------------------------------------------------------------
704 template <int Q1d>
705 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
706   *w = qweight1d[data.tidx];
707 }
708 
709 //------------------------------------------------------------------------------
710 // 2D quadrature weights
711 //------------------------------------------------------------------------------
712 template <int Q1d>
713 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
714   *w = qweight1d[data.tidx]*qweight1d[data.tidy];
715 }
716 
717 //------------------------------------------------------------------------------
718 // 3D quadrature weights
719 //------------------------------------------------------------------------------
720 template <int Q1d>
721 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
722   const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
723   for (CeedInt z = 0; z < Q1d; ++z)
724     w[z] = pw*qweight1d[z];
725 }
726 
727 );
728 //------------------------------------------------------------------------------
729 // Build singe operator kernel
730 //------------------------------------------------------------------------------
731 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
732 
733   using std::ostringstream;
734   using std::string;
735   int ierr;
736   bool setupdone;
737   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
738   if (setupdone) return 0;
739   Ceed ceed;
740   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
741   CeedOperator_Cuda_gen *data;
742   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
743   CeedQFunction qf;
744   CeedQFunction_Cuda_gen *qf_data;
745   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
746   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
747   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
748           numoutputfields, ncomp, dim = 0, lsize;
749   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
750   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
751   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
752   CeedChk(ierr);
753   CeedOperatorField *opinputfields, *opoutputfields;
754   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
755   CeedChk(ierr);
756   CeedQFunctionField *qfinputfields, *qfoutputfields;
757   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
758   CeedChk(ierr);
759   CeedEvalMode emode;
760   CeedBasis basis;
761   CeedBasis_Cuda_shared *basis_data;
762   CeedElemRestriction Erestrict;
763   CeedElemRestriction_Cuda_reg *restr_data;
764 
765   ostringstream code;
766   string devFunctions(deviceFunctions);
767 
768   // Add atomicAdd function for old NVidia architectures
769   struct cudaDeviceProp prop;
770   Ceed_Cuda *ceed_data;
771   ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr);
772   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
773   if (prop.major<6){
774     code << atomicAdd;
775   }
776 
777   code << devFunctions;
778 
779   string qFunction(qf_data->qFunctionSource);
780 
781   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
782   code << "\n#define CeedPragmaSIMD\n";
783 
784   // Find dim and Q1d
785   bool collograd = false;
786   for (CeedInt i = 0; i < numinputfields; i++) {
787     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
788     if (basis != CEED_BASIS_COLLOCATED) {
789       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
790 
791       // Check for collocated gradient
792       if (basis_data->d_collograd1d)
793         collograd = true;
794 
795       // Collect dim and Q1d
796       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
797       bool isTensor;
798       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
799       if (isTensor) {
800         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
801       } else {
802         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
803         }
804     }
805   }
806   // Check output bases for Q1d, dim as well
807   //   The only imput basis might be CEED_BASIS_COLLOCATED
808   for (CeedInt i = 0; i < numoutputfields; i++) {
809     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
810     if (basis != CEED_BASIS_COLLOCATED) {
811       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
812       // Collect dim and Q1d
813       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
814       bool isTensor;
815       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
816       if (isTensor) {
817         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
818       } else {
819         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
820         }
821     }
822   }
823   data->dim = dim;
824   data->Q1d = Q1d;
825 
826   // Define CEED_Q_VLA
827   if (dim != 3 || collograd) {
828     code << "\n#define CEED_Q_VLA 1\n\n";
829   } else {
830     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
831   }
832 
833   code << qFunction;
834 
835   // Setup
836   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
837   // Input Evecs and Restriction
838   for (CeedInt i = 0; i < numinputfields; i++) {
839     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
840     CeedChk(ierr);
841     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
842       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
843     }
844   }
845 
846   for (CeedInt i = 0; i < numoutputfields; i++) {
847     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
848   }
849 
850   code << "const CeedInt Dim = "<<dim<<";\n";
851   code << "const CeedInt Q1d = "<<Q1d<<";\n";
852 
853   code << "extern __shared__ CeedScalar slice[];\n";
854   code << "BackendData data;\n";
855   code << "data.tidx = threadIdx.x;\n";
856   code << "data.tidy = threadIdx.y;\n";
857   code << "data.tidz = threadIdx.z;\n";
858   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
859   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
860 
861   code << "\n// Input field constants and basis data\n";
862   //Initialize constants, and matrices B and G
863   for (CeedInt i = 0; i < numinputfields; i++) {
864     code << "// -- Input field "<<i<<" --\n";
865     // Get elemsize, emode, ncomp
866     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
867     CeedChk(ierr);
868     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
869     CeedChk(ierr);
870     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
871     CeedChk(ierr);
872     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
873     CeedChk(ierr);
874 
875     // Set field constants
876     if (emode != CEED_EVAL_WEIGHT) {
877       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
878       if (basis != CEED_BASIS_COLLOCATED) {
879         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
880         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
881       } else {
882         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
883       }
884       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
885     }
886 
887     // Load basis data
888     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
889     switch (emode) {
890     case CEED_EVAL_NONE:
891       break;
892     case CEED_EVAL_INTERP:
893       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
894       data->B.in[i] = basis_data->d_interp1d;
895       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
896       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
897       break;
898     case CEED_EVAL_GRAD:
899       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
900       data->B.in[i] = basis_data->d_interp1d;
901       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
902       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
903       if (basis_data->d_collograd1d) {
904         data->G.in[i] = basis_data->d_collograd1d;
905         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
906         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
907       } else {
908         data->G.in[i] = basis_data->d_grad1d;
909         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
910         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
911       }
912       break;
913     case CEED_EVAL_WEIGHT:
914       break; // No action
915     case CEED_EVAL_DIV:
916       break; // TODO: Not implemented
917     case CEED_EVAL_CURL:
918       break; // TODO: Not implemented
919     }
920   }
921 
922   code << "\n// Output field constants and basis data\n";
923   for (CeedInt i = 0; i < numoutputfields; i++) {
924     code << "// -- Output field "<<i<<" --\n";
925     // Get elemsize, emode, ncomp
926     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
927     CeedChk(ierr);
928     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
929     CeedChk(ierr);
930     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
931     CeedChk(ierr);
932     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
933     CeedChk(ierr);
934 
935     // Set field constants
936     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
937     if (basis != CEED_BASIS_COLLOCATED) {
938       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
939       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
940     } else {
941       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
942     }
943     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
944 
945     // Load basis data
946     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
947     switch (emode) {
948     case CEED_EVAL_NONE:
949       break; // No action
950     case CEED_EVAL_INTERP:
951       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
952       data->B.out[i] = basis_data->d_interp1d;
953       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
954       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
955       break;
956     case CEED_EVAL_GRAD:
957       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
958       data->B.out[i] = basis_data->d_interp1d;
959       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
960       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
961       if (basis_data->d_collograd1d) {
962         data->G.out[i] = basis_data->d_collograd1d;
963         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
964         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
965       } else {
966         data->G.out[i] = basis_data->d_grad1d;
967         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
968         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
969       }
970       break;
971     case CEED_EVAL_WEIGHT: {
972       Ceed ceed;
973       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
974       return CeedError(ceed, 1,
975                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
976       break; // Should not occur
977     }
978     case CEED_EVAL_DIV:
979       break; // TODO: Not implemented
980     case CEED_EVAL_CURL:
981       break; // TODO: Not implemented
982     }
983   }
984   code << "\n";
985   code << "__syncthreads();\n";
986   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
987   // Input basis apply if needed
988   // Generate the correct eval mode code for each input
989   code << "\n// Input field restrictions and basis actions\n";
990   for (CeedInt i = 0; i < numinputfields; i++) {
991     code << "  // -- Input field "<<i<<" --\n";
992     // Get elemsize, emode, ncomp
993     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
994     CeedChk(ierr);
995     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
996     CeedChk(ierr);
997     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
998     CeedChk(ierr);
999     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1000     CeedChk(ierr);
1001 
1002     // Restriction
1003     if (emode != CEED_EVAL_WEIGHT &&
1004         !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) {
1005       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1006 
1007       bool isStrided;
1008       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1009       if (!isStrided) {
1010         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1011         CeedChk(ierr);
1012         code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1013         CeedInt compstride;
1014         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1015         code << "  // CompStride: "<<compstride<<"\n";
1016         ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1017         data->indices.in[i] = restr_data->d_ind;
1018         code << "  readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1019       } else {
1020         bool backendstrides;
1021         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1022         CeedChk(ierr);
1023         CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1024         if (!backendstrides) {
1025           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1026           CeedChk(ierr);
1027         }
1028         code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1029         code << "  readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1030       }
1031     }
1032 
1033     // Basis action
1034     code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1035     switch (emode) {
1036     case CEED_EVAL_NONE:
1037       if (!basis_data->d_collograd1d) {
1038         code << "  CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1039       }
1040       break;
1041     case CEED_EVAL_INTERP:
1042       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1043       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1044       break;
1045     case CEED_EVAL_GRAD:
1046       if (basis_data->d_collograd1d) {
1047         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1048         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1049       } else {
1050         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1051         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1052       }
1053       break;
1054     case CEED_EVAL_WEIGHT:
1055       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1056       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1057       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1058       data->W = basis_data->d_qweight1d;
1059       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1060       break; // No action
1061     case CEED_EVAL_DIV:
1062       break; // TODO: Not implemented
1063     case CEED_EVAL_CURL:
1064       break; // TODO: Not implemented
1065     }
1066   }
1067 
1068   // Q function
1069   code << "\n// Output field setup\n";
1070   for (CeedInt i = 0; i < numoutputfields; i++) {
1071       code << "  // -- Output field "<<i<<" --\n";
1072     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1073     CeedChk(ierr);
1074     if (emode==CEED_EVAL_GRAD)
1075     {
1076       if (basis_data->d_collograd1d) {
1077         //Accumulator for gradient slices
1078         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1079         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1080         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1081         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1082         code << "    }\n";
1083         code << "  }\n";
1084       } else {
1085         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1086       }
1087     }
1088     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1089     {
1090       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1091     }
1092   }
1093   // We treat quadrature points per slice in 3d to save registers
1094   if (basis_data->d_collograd1d) {
1095     code << "\n  // Note: Collocated Gradient\n";
1096     code << "#pragma unroll\n";
1097     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1098     for (CeedInt i = 0; i < numinputfields; i++) {
1099       code << "  // -- Input field "<<i<<" --\n";
1100       // Get elemsize, emode, ncomp
1101       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1102       CeedChk(ierr);
1103       // Basis action
1104       code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n";
1105       switch (emode) {
1106       case CEED_EVAL_NONE:
1107         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1108 
1109         bool isStrided;
1110         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1111         if (!isStrided) {
1112           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1113           CeedChk(ierr);
1114           code << "  const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1115           CeedInt compstride;
1116           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1117           code << "  // CompStride: "<<compstride<<"\n";
1118           ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1119           data->indices.in[i] = restr_data->d_ind;
1120           code << "  readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1121         } else {
1122           bool backendstrides;
1123           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1124           CeedChk(ierr);
1125           CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1126           if (!backendstrides) {
1127             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1128             CeedChk(ierr);
1129           }
1130           code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1131           code << "  readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1132         }
1133         break;
1134       case CEED_EVAL_INTERP:
1135         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1136         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1137         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1138         code << "  }\n";
1139         break;
1140       case CEED_EVAL_GRAD:
1141         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1142         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1143         break;
1144       case CEED_EVAL_WEIGHT:
1145         code << "  CeedScalar r_q"<<i<<"[1];\n";
1146         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1147         break; // No action
1148       case CEED_EVAL_DIV:
1149         break; // TODO: Not implemented
1150       case CEED_EVAL_CURL:
1151         break; // TODO: Not implemented
1152       }
1153     }
1154     for (CeedInt i = 0; i < numoutputfields; i++) {
1155       code << "  // -- Output field "<<i<<" --\n";
1156       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1157       CeedChk(ierr);
1158       // Basis action
1159       switch (emode) {
1160       case CEED_EVAL_NONE:
1161         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1162         break; // No action
1163       case CEED_EVAL_INTERP:
1164         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1165         break;
1166       case CEED_EVAL_GRAD:
1167         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1168         break;
1169       case CEED_EVAL_WEIGHT:
1170         break; // Should not occur
1171       case CEED_EVAL_DIV:
1172         break; // TODO: Not implemented
1173       case CEED_EVAL_CURL:
1174         break; // TODO: Not implemented
1175       }
1176     }
1177   } else {
1178       code << "\n  // Note: No Collocated Gradient\n";
1179     for (CeedInt i = 0; i < numinputfields; i++) {
1180       code << "  // -- Input field "<<i<<" --\n";
1181       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1182     }
1183     for (CeedInt i = 0; i < numoutputfields; i++) {
1184       code << "  // -- Output field "<<i<<" --\n";
1185       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1186     }
1187   }
1188   code << "  // QFunction Inputs and outputs\n";
1189   code << "  CeedScalar* in["<<numinputfields<<"];\n";
1190   for (CeedInt i = 0; i < numinputfields; i++) {
1191     code << "  // -- Input field "<<i<<" --\n";
1192     code << "  in["<<i<<"] = r_q"<<i<<";\n";
1193   }
1194   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
1195   for (CeedInt i = 0; i < numoutputfields; i++) {
1196     code << "  // -- Output field "<<i<<" --\n";
1197     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
1198   }
1199   code << "\n  // Apply QFunction\n";
1200   string qFunctionName(qf_data->qFunctionName);
1201   code << "  "<<qFunctionName<<"(ctx, ";
1202   if (dim != 3 || basis_data->d_collograd1d) {
1203     code << "1 ";
1204   } else {
1205     code << "Q1d ";
1206   }
1207   code << ", in, out);\n";
1208   if (basis_data->d_collograd1d) {
1209     code << "\n  // Note: Collocated Gradient\n";
1210     for (CeedInt i = 0; i < numoutputfields; i++) {
1211       code << "  // -- Output field "<<i<<" --\n";
1212       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1213       CeedChk(ierr);
1214       // Basis action
1215       code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1216       switch (emode) {
1217       case CEED_EVAL_NONE:
1218         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1219         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1220         code << "  }\n";
1221         break; // No action
1222       case CEED_EVAL_INTERP:
1223         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1224         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1225         code << "  }\n";
1226         break;
1227       case CEED_EVAL_GRAD:
1228         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1229         break;
1230       case CEED_EVAL_WEIGHT:
1231         break; // Should not occur
1232       case CEED_EVAL_DIV:
1233         break; // TODO: Not implemented
1234       case CEED_EVAL_CURL:
1235         break; // TODO: Not implemented
1236       }
1237     }
1238     code << "}\n";
1239   }
1240 
1241   // Output basis apply if needed
1242   // Generate the correct eval mode code for each output
1243   code << "\n// Output field basis action and restrictions\n";
1244   for (CeedInt i = 0; i < numoutputfields; i++) {
1245     code << "  // -- Output field "<<i<<" --\n";
1246     // Get elemsize, emode, ncomp
1247     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1248     CeedChk(ierr);
1249     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1250     CeedChk(ierr);
1251     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1252     CeedChk(ierr);
1253     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1254     CeedChk(ierr);
1255     // Basis action
1256     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1257     switch (emode) {
1258     case CEED_EVAL_NONE:
1259       code << "  CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1260       break; // No action
1261     case CEED_EVAL_INTERP:
1262       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1263       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1264       break;
1265     case CEED_EVAL_GRAD:
1266       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1267       if (basis_data->d_collograd1d) {
1268         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1269       } else {
1270         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1271       }
1272       break;
1273     case CEED_EVAL_WEIGHT: {
1274       Ceed ceed;
1275       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1276       return CeedError(ceed, 1,
1277                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1278       break; // Should not occur
1279     }
1280     case CEED_EVAL_DIV:
1281       break; // TODO: Not implemented
1282     case CEED_EVAL_CURL:
1283       break; // TODO: Not implemented
1284     }
1285     // Restriction
1286       bool isStrided;
1287       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1288     if (!isStrided) {
1289       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1290       CeedChk(ierr);
1291       code << "  const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1292       CeedInt compstride;
1293       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1294       code << "  // CompStride: "<<compstride<<"\n";
1295       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1296       data->indices.out[i] = restr_data->d_ind;
1297       code << "  writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1298     } else {
1299       bool backendstrides;
1300       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1301       CeedChk(ierr);
1302       CeedInt strides[3] = {1, elemsize, elemsize*ncomp};
1303       if (!backendstrides) {
1304         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1305         CeedChk(ierr);
1306       }
1307       code << "  // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1308       code << "  writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1309     }
1310   }
1311 
1312   code << "  }\n";
1313   code << "}\n\n";
1314 
1315   // View kernel for debugging
1316   CeedDebug(code.str().c_str());
1317 
1318   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0);
1319   CeedChk(ierr);
1320   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1321   CeedChk(ierr);
1322 
1323   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1324   return 0;
1325 }
1326 //------------------------------------------------------------------------------
1327