xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision ccedf6b0eedde5bdb26f1d628a64390ea2c01243)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #include <ceed-backend.h>
17 #include "ceed-cuda-gen.h"
18 #include <iostream>
19 #include <sstream>
20 #include "../cuda/ceed-cuda.h"
21 #include "../cuda-reg/ceed-cuda-reg.h"
22 #include "../cuda-shared/ceed-cuda-shared.h"
23 
// Software double-precision atomicAdd built on a CAS loop; this source
// string is prepended to the generated kernel when double atomics are used.
// NOTE(review): sm_60+ provides a native atomicAdd(double*, double); if this
// string is ever compiled for such an architecture it will conflict with the
// builtin. An __CUDA_ARCH__ guard cannot be added here because preprocessor
// directives are not allowed inside a macro argument — confirm how the
// runtime compilation selects the target arch.
static const char *atomicAdd = QUOTE(
__device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
);
40 
41 static const char *deviceFunctions = QUOTE(
42 
// Argument-marshalling structs for the generated operator kernel: fixed
// capacity (16) arrays of input/output field pointers (values and indices).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
45 
// Per-thread context passed to every device helper below.
// slice points at a buffer the whole block can see: threads publish values
// into it and read each other's entries across __syncthreads() in the
// Contract* helpers.
typedef struct {
  CeedInt tidx;       // presumably threadIdx.x — set by generated kernel; confirm
  CeedInt tidy;       // presumably threadIdx.y — confirm
  CeedInt tidz;       // presumably threadIdx.z — confirm
  CeedInt tid;        // flattened thread id within the block (see loadMatrix)
  CeedScalar* slice;  // cross-thread scratch used by the tensor contractions
} BackendData;
53 
// Cooperatively copy a P*Q basis matrix from d_B into B, all threads of the
// block participating via a block-stride loop over the flattened entries.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt nthreads = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += nthreads)
    B[k] = d_B[k];
}
60 
61 //****
62 // 1D
// Gather one 1D node's values (thread tidx = node) from d_u into registers
// r_u, one entry per component, through the restriction indices.
// Components are nnodes apart in d_u; strides is unused in this variant.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof + c*nnodes];
  }
}
74 
// Same as readDofs1d but for interleaved component layout: components of a
// node are adjacent (d_u[dof*NCOMP + c]); strides is unused.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt dof = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof*NCOMP + c];
  }
}
// Read nodal values with stride-based (index-free) addressing:
// d_u[node*strides[0] + comp*strides[1] + elem*strides[2]]; indices unused.
// Bug fix: the template header was missing, leaving NCOMP and P1d undeclared
// in this function and breaking compilation of the generated source — the
// 2D/3D strided siblings all carry this header.
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx<P1d)
  {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * strides[0] + elem * strides[2];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * strides[1]];
    }
  }
}
96 
// Read this element's quadrature-point values (thread tidx = point) into
// registers; components are nquads apart in d_u.
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx + c*nquads];
}
105 
// Interleaved-component variant of readQuads1d: components of a point are
// adjacent in d_u (d_u[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx*NCOMP + c];
}
114 
// Accumulate register values r_v into d_v through the restriction indices.
// atomicAdd is used because neighboring elements can map to the same dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + c*nnodes], r_v[c]);
  }
}
126 
// Interleaved-component variant of writeDofs1d (d_v[dof*NCOMP + c]);
// accumulation is atomic for the same reason.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt dof = indices[data.tidx + elem*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof*NCOMP + c], r_v[c]);
  }
}
138 
// Stride-addressed accumulation into d_v (no restriction indices):
// d_v[node*strides[0] + c*strides[1] + elem*strides[2]] += r_v[c].
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt base = data.tidx*strides[0] + elem*strides[2];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[base + c*strides[1]], r_v[c]);
  }
}
150 
// Store quadrature-point register values to d_v; plain stores suffice since
// each point belongs to exactly one element/thread.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt idx = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[idx + c*nquads] = r_v[c];
}
159 
// Interleaved-component variant of writeQuads1d (d_v[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt idx = data.tidx + elem*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[idx*NCOMP + c] = r_v[c];
}
168 
// 1D x-contraction: each thread publishes its value *U into data.slice, then
// computes *V = sum_i B[i + tidx*P1d] * slice[i] (B applied along x).
// NOTE(review): assumes enough threads run so slice[0..P1d) is populated and
// that all threads reach the barriers — confirm against the generated launch.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();  // slice fully written before any thread reads it
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();  // all reads complete before slice is reused
}
180 
// Transposed 1D x-contraction: applies B^T, i.e.
// *V = sum_i B[tidx + i*P1d] * slice[i], summing over the Q1d quadrature
// entries published by the other threads.
// NOTE(review): no tidx<P1d guard here (unlike the 2D transpose variants);
// threads with tidx >= P1d read B out of its logical column range — confirm
// intended for the launch shapes used.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();  // publish before cross-thread reads
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i) {
    *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
  }
  __syncthreads();  // reads done before slice is reused
}
192 
// 1D interpolation: apply the basis matrix c_B to every component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++)
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, r_V+c);
}
200 
// Transposed 1D interpolation: apply c_B transposed to every component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++)
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, r_V+c);
}
208 
// 1D gradient: apply the derivative matrix c_G to every component.
// c_B is accepted for signature parity with the 2D/3D versions but unused.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++)
    ContractX1d<NCOMP,P1d,Q1d>(data, r_U+c, c_G, r_V+c);
}
217 
// Transposed 1D gradient: apply c_G transposed to every component.
// c_B is accepted for signature parity with the 2D/3D versions but unused.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; c++)
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+c, c_G, r_V+c);
}
226 
227 //****
228 // 2D
// Gather one 2D node's values (node = tidx + tidy*P1d) from d_u into r_u,
// one entry per component, through the restriction indices; components are
// nnodes apart in d_u; strides is unused in this variant.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof + c*nnodes];
  }
}
240 
// Interleaved-component variant of readDofs2d (d_u[dof*NCOMP + c]);
// strides is unused.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof*NCOMP + c];
  }
}
// Stride-addressed 2D read (no restriction indices):
// d_u[node*strides[0] + c*strides[1] + elem*strides[2]].
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt base = (data.tidx + data.tidy*P1d)*strides[0] + elem*strides[2];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[base + c*strides[1]];
  }
}
263 
// Read this element's 2D quadrature-point values (point = tidx + tidy*Q1d)
// into registers; components are nquads apart in d_u.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx + c*nquads];
}
272 
// Interleaved-component variant of readQuads2d (d_u[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx*NCOMP + c];
}
281 
// Accumulate 2D nodal register values into d_v through the restriction
// indices; atomicAdd since neighboring elements may share a dof.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + c*nnodes], r_v[c]);
  }
}
293 
// Interleaved-component variant of writeDofs2d (d_v[dof*NCOMP + c]).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem*P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof*NCOMP + c], r_v[c]);
  }
}
305 
// Stride-addressed 2D accumulation (no restriction indices):
// d_v[node*strides[0] + c*strides[1] + elem*strides[2]] += r_v[c].
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt base = (data.tidx + data.tidy*P1d)*strides[0] + elem*strides[2];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[base + c*strides[1]], r_v[c]);
  }
}
317 
// Store 2D quadrature-point register values to d_v; plain stores, since a
// point belongs to exactly one element/thread.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[idx + c*nquads] = r_v[c];
}
326 
// Interleaved-component variant of writeQuads2d (d_v[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + elem*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    d_v[idx*NCOMP + c] = r_v[c];
}
335 
// 2D x-contraction: threads publish *U into the Q1d-pitched scratch, then
// each computes *V = sum_i B[i + tidx*P1d] * slice[i + tidy*Q1d] — B applied
// along x, y held fixed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();  // all writes visible before any thread reads
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
  }
  __syncthreads();  // reads done before slice is reused
}
347 
// 2D y-contraction: *V = sum_i B[i + tidy*P1d] * slice[tidx + i*Q1d] — B
// applied along y, x held fixed; values exchanged through the scratch slice.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();  // all writes visible before any thread reads
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i) {
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
  }
  __syncthreads();  // reads done before slice is reused
}
359 
// Transposed 2D y-contraction (B^T along y): only threads with tidy < P1d
// produce a value; all other threads still zero *V and must reach both
// barriers, so the guard covers only the accumulation loop.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();  // publish before cross-thread reads
  *V = 0.0;
  if (data.tidy<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
  }
  __syncthreads();  // reads done before slice is reused
}
373 
// Transposed 2D x-contraction (B^T along x): only threads with tidx < P1d
// accumulate; barriers are outside the guard so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();  // publish before cross-thread reads
  *V = 0.0;
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();  // reads done before slice is reused
}
387 
// Accumulating variant of ContractXTranspose2d: identical contraction but
// *V is NOT zeroed first — the result is added into the caller's existing
// value (used to sum x- and y-derivative contributions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();  // publish before cross-thread reads
  if (data.tidx<P1d) {
    for (CeedInt i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
  }
  __syncthreads();  // reads done before slice is reused
}
400 
// 2D interpolation: contract x then y with c_B for each component, using a
// one-element register buffer between the passes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, tmp, c_B, r_V+c);
  }
}
410 
// Transposed 2D interpolation: contract y then x with c_B transposed for
// each component (reverse order of interp2d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, tmp);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, tmp, c_B, r_V+c);
  }
}
420 
// 2D gradient: for each component write the x-derivative (G in x, B in y)
// into r_V[c] and the y-derivative (B in x, G in y) into r_V[c + NCOMP].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    // d/dx
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+c, c_G, tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, tmp, c_B, r_V+c);
    // d/dy
    ContractX2d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, tmp, c_G, r_V+c+NCOMP);
  }
}
433 
// Transposed 2D gradient: fold the x-derivative slot (r_U[c]) and the
// y-derivative slot (r_U[c + NCOMP]) back to nodes, summing both into
// r_V[c]; the second pass uses the accumulating x-contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; c++) {
    // x-derivative contribution
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+c, c_B, tmp);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, tmp, c_G, r_V+c);
    // y-derivative contribution, accumulated
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+c+NCOMP, c_G, tmp);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, tmp, c_B, r_V+c);
  }
}
446 
447 //****
448 // 3D
// Gather a column of 3D nodes (one z-layer per loop iteration, thread owns
// tidx/tidy) from d_u into r_u[z + c*P1d] via the restriction indices;
// components are nnodes apart in d_u; strides is unused.
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[dof + c*nnodes];
    }
  }
}
461 
// Interleaved-component variant of readDofs3d (d_u[dof*NCOMP + c]);
// strides is unused.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[dof*NCOMP + c];
    }
  }
}
// Stride-addressed 3D read (no restriction indices):
// d_u[node*strides[0] + c*strides[1] + elem*strides[2]] into r_u[z + c*P1d].
template <int NCOMP, int P1d>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt base = node*strides[0] + elem*strides[2];
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[base + c*strides[1]];
    }
  }
}
486 
// Read a full z-column of 3D quadrature values into r_u[z + c*Q1d];
// components are nquads apart in d_u.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt idx = data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*Q1d] = d_u[idx + c*nquads];
  }
}
497 
// Read a single z-slice (layer q) of 3D quadrature values into r_u, one
// entry per component; components are nquads apart in d_u.
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + q*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx + c*nquads];
}
506 
// Interleaved-component variant of readQuads3d (d_u[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt idx = data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[z + c*Q1d] = d_u[idx*NCOMP + c];
  }
}
517 
// Interleaved-component variant of readSliceQuads3d (d_u[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt idx = data.tidx + data.tidy*Q1d + q*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++)
    r_u[c] = d_u[idx*NCOMP + c];
}
526 
// Accumulate a z-column of 3D nodal register values (r_v[z + c*P1d]) into
// d_v via the restriction indices; atomicAdd since elements can share dofs.
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[dof + c*nnodes], r_v[z + c*P1d]);
    }
  }
}
539 
// Interleaved-component variant of writeDofs3d (d_v[dof*NCOMP + c]).
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt dof = indices[data.tidx + data.tidy*P1d + z*P1d*P1d + elem*P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[dof*NCOMP + c], r_v[z + c*P1d]);
    }
  }
}
552 
// Stride-addressed 3D accumulation (no restriction indices):
// d_v[node*strides[0] + c*strides[1] + elem*strides[2]] += r_v[z + c*P1d].
template <int NCOMP, int P1d>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    for (CeedInt z = 0; z < P1d; z++) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt base = node*strides[0] + elem*strides[2];
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[base + c*strides[1]], r_v[z + c*P1d]);
    }
  }
}
565 
// Store a z-column of 3D quadrature register values (r_v[z + c*Q1d]) to
// d_v; plain stores, since each point belongs to one element/thread.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt idx = data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[idx + c*nquads] = r_v[z + c*Q1d];
  }
}
576 
// Interleaved-component variant of writeQuads3d (d_v[idx*NCOMP + c]).
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt idx = data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[idx*NCOMP + c] = r_v[z + c*Q1d];
  }
}
587 
// 3D x-contraction: caches this thread's row of B in registers, then for
// each z-layer k publishes U[k] to the scratch slice and computes
// V[k] = sum_i r_B[i] * slice[i + tidy*Q1d] (B applied along x).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidx*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
605 
// 3D y-contraction: caches this thread's row of B, then per z-layer k
// computes V[k] = sum_i r_B[i] * slice[tidx + i*Q1d] (B applied along y).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i) {
    r_B[i] = B[i + data.tidy*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i) {
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
623 
// 3D z-contraction: purely register-local (the z-column lives in U), so no
// cross-thread exchange or barriers: V[q] = sum_p B[p + q*P1d] * U[p].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt q = 0; q < Q1d; q++) {
    CeedScalar sum = 0.0;
    for (CeedInt p = 0; p < P1d; p++)
      sum += B[p + q*P1d] * U[p];
    V[q] = sum;
  }
}
634 
// Transposed 3D z-contraction (B^T), register-local: V[p] = sum_q
// B[p + q*P1d] * U[q] for p < P1d; remaining entries of V are zeroed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt p = 0; p < Q1d; p++) {
    CeedScalar sum = 0.0;
    if (p < P1d) {
      for (CeedInt q = 0; q < Q1d; q++)
        sum += B[p + q*P1d] * U[q];
    }
    V[p] = sum;
  }
}
647 
// Transposed 3D y-contraction (B^T along y): caches this thread's column of
// B, then per layer k publishes U[k] and, for threads with tidy < P1d,
// accumulates V[k] = sum_i r_B[i] * slice[tidx + i*Q1d]. Barriers sit
// outside the guard so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
  {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    V[k] = 0.0;
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
668 
// Accumulating variant of ContractTransposeY3d: identical contraction but
// V[k] is NOT zeroed — results are added into the caller's existing values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidy + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    if (data.tidy<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction
      }
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
687 
// Transposed 3D x-contraction (B^T along x): caches this thread's column of
// B, then per layer k publishes U[k] and, for threads with tidx < P1d,
// computes V[k] = sum_i r_B[i] * slice[i + tidy*Q1d]. Barriers sit outside
// the guard so every thread reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    V[k] = 0.0;
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
707 
// Accumulating variant of ContractTransposeX3d: identical contraction but
// V[k] is NOT zeroed — results are added into the caller's existing values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i) {
    r_B[i] = B[data.tidx + i*P1d];
  }
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();  // layer fully published before reads
    if (data.tidx<P1d) {
      for (CeedInt i = 0; i < Q1d; ++i) {
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction
      }
    }
    __syncthreads();  // reads done before next layer overwrites slice
  }
}
726 
// 3D interpolation: contract x, y, then z with c_B for every component,
// passing a z-column of values through two register buffers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[Q1d];
  CeedScalar tmp2[Q1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+c*P1d, c_B, tmp1);
    ContractY3d<NCOMP,P1d,Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, tmp2, c_B, r_V+c*Q1d);
  }
}
738 
// Transposed 3D interpolation: contract z, y, then x with c_B transposed
// (reverse order of interp3d) for every component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[Q1d];
  CeedScalar tmp2[Q1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+c*Q1d, c_B, tmp1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, tmp1, c_B, tmp2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, tmp2, c_B, r_V+c*P1d);
  }
}
750 
// 3D gradient: for each component compute three directional derivatives —
// G applied in exactly one direction, B in the other two — writing them to
// r_V slots c*Q1d, c*Q1d + NCOMP*Q1d, and c*Q1d + 2*NCOMP*Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[Q1d];
  CeedScalar tmp2[Q1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    // d/dx: G,B,B
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+c*P1d, c_G, tmp1);
    ContractY3d<NCOMP,P1d,Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, tmp2, c_B, r_V+c*Q1d);
    // d/dy: B,G,B
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+c*P1d, c_B, tmp1);
    ContractY3d<NCOMP,P1d,Q1d>(data, tmp1, c_G, tmp2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, tmp2, c_B, r_V+c*Q1d+NCOMP*Q1d);
    // d/dz: B,B,G
    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+c*P1d, c_B, tmp1);
    ContractY3d<NCOMP,P1d,Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP,P1d,Q1d>(data, tmp2, c_G, r_V+c*Q1d+2*NCOMP*Q1d);
  }
}
769 
// Transpose of grad3d: contract each of the three derivative slots of r_U
// ([dim][comp][Q1d]) back to nodal values and sum them into r_V.  The first
// (x-derivative) slot initializes r_V via ContractTransposeX3d; the y and z
// slots accumulate via the ...AddX3d variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar buf1[Q1d];
  CeedScalar buf2[Q1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    CeedScalar *V_c = r_V + c*P1d;
    // x-derivative slot (writes V_c)
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U + c*Q1d + 0*NCOMP*Q1d, c_B, buf1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, buf1, c_B, buf2);
    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, buf2, c_G, V_c);
    // y-derivative slot (accumulates into V_c)
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U + c*Q1d + 1*NCOMP*Q1d, c_B, buf1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, buf1, c_G, buf2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, buf2, c_B, V_c);
    // z-derivative slot (accumulates into V_c)
    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U + c*Q1d + 2*NCOMP*Q1d, c_G, buf1);
    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, buf1, c_B, buf2);
    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, buf2, c_B, V_c);
  }
}
788 
// Collocated 3D gradient at z-slice q: compute the X, Y and Z derivatives of
// r_U using the collocated derivative matrix c_G.  r_U holds a full z-column
// per thread ([comp][Q1d]); r_V receives the three derivatives for slice q
// ([dim][comp]).  The x/y contractions go through shared memory (data.slice);
// the z contraction is register-only since each thread owns its z-column.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this thread's value at slice q in shared memory for x/y access
    data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads(); // slice must be fully written before any thread reads it
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    // Z derivative: register-only, reads the whole z-column of r_U
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative)
    }
    __syncthreads(); // all reads done before the next component overwrites slice
  }
}
814 
// Transpose of gradCollo3d: accumulate the contribution of the slice-q
// derivative values r_U ([dim][comp]) into the z-column output r_V
// ([comp][Q1d]).  Note the transposed c_G indexing relative to gradCollo3d.
// r_V is accumulated into (+=), so the caller must initialize it.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads(); // slice fully written before any thread reads it
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative)
    }
    __syncthreads(); // reads done before slice is rewritten below
    // Y derivative
    data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative: register-only; each slice q adds its share to the column
    for (CeedInt i = 0; i < Q1d; ++i) {
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative)
    }
  }
}
840 
// 1D quadrature weight: one quadrature point per thread in x.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx];
}
845 
// 2D quadrature weight: tensor product of the 1D weights at (tidx, tidy).
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}
850 
// 3D quadrature weight: tensor product of the 1D weights.  Each thread owns a
// full column of Q1d points in z at its (tidx, tidy) position.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Weight of this thread's (x, y) column, shared by every z point.
  const CeedScalar wxy = qweight1d[data.tidx] * qweight1d[data.tidy];
  for (CeedInt k = 0; k < Q1d; k++) {
    w[k] = wxy * qweight1d[k];
  }
}
859 
860 );
861 
862 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
863 
864 	using std::ostringstream;
865   using std::string;
866   int ierr;
867   bool setupdone;
868   ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
869   if (setupdone) return 0;
870   Ceed ceed;
871   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
872   CeedOperator_Cuda_gen *data;
873   ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
874   CeedQFunction qf;
875   CeedQFunction_Cuda_gen *qf_data;
876   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
877   ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
878   CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes;
879   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
880   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
881   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
882   CeedChk(ierr);
883   CeedOperatorField *opinputfields, *opoutputfields;
884   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
885   CeedChk(ierr);
886   CeedQFunctionField *qfinputfields, *qfoutputfields;
887   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
888   CeedChk(ierr);
889   CeedEvalMode emode;
890   CeedInterlaceMode imode;
891   CeedBasis basis;
892   CeedBasis_Cuda_shared *basis_data;
893   CeedElemRestriction Erestrict;
894   CeedElemRestriction_Cuda_reg *restr_data;
895 
896   ostringstream code;
897   string devFunctions(deviceFunctions);
898 
899   // Add atomicAdd function for old NVidia architectures
900   struct cudaDeviceProp prop;
901   Ceed delegate;
902   CeedGetDelegate(ceed, &delegate);
903   Ceed_Cuda *ceed_data;
904   ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr);
905   ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId);
906   if (prop.major<6){
907     code << atomicAdd;
908   }
909 
910   code << devFunctions;
911 
912   string qFunction(qf_data->qFunctionSource);
913 
914   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
915   code << "\n#define CeedPragmaSIMD\n";
916   code << qFunction;
917 
918   // Setup
919   code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
920   // Input Evecs and Restriction
921   for (CeedInt i = 0; i < numinputfields; i++) {
922     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
923     CeedChk(ierr);
924     if (emode == CEED_EVAL_WEIGHT) { // Skip
925     } else {
926       code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
927       if (emode != CEED_EVAL_NONE)
928       {
929         ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
930         bool isTensor;
931         ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
932         //TODO check that all are the same
933         ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
934         if (isTensor)
935         {
936           //TODO check that all are the same
937           ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
938         } else {
939           return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
940         }
941       }
942     }
943   }
944   data->dim = dim;
945   data->Q1d = Q1d;
946 
947   for (CeedInt i = 0; i < numoutputfields; i++) {
948     code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
949   }
950   code << "const CeedInt Dim = "<<dim<<";\n";
951   code << "const CeedInt Q1d = "<<Q1d<<";\n";
952   // code << "const CeedInt Q   = "<<Q<<";\n";
953   code << "extern __shared__ CeedScalar slice[];\n";
954   code << "BackendData data;\n";
955   code << "data.tidx = threadIdx.x;\n";
956   code << "data.tidy = threadIdx.y;\n";
957   code << "data.tidz = threadIdx.z;\n";
958   code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
959   code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
960   //Initialize constants, and matrices B and G
961   for (CeedInt i = 0; i < numinputfields; i++) {
962     code << "// Input field "<<i<<"\n";
963     // Get elemsize, emode, ncomp
964     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
965     CeedChk(ierr);
966     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
967     CeedChk(ierr);
968     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
969     CeedChk(ierr);
970     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
971     CeedChk(ierr);
972     // Basis action
973     switch (emode) {
974     case CEED_EVAL_NONE:
975       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
976       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
977       code << "  const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n";
978       break;
979     case CEED_EVAL_INTERP:
980       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
981       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
982       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
983       code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
984       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
985       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
986       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
987       data->B.in[i] = basis_data->d_interp1d;
988       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
989       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
990       break;
991     case CEED_EVAL_GRAD:
992       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
993       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
994       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
995       code << "  const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n";
996       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
997       code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
998       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
999       data->B.in[i] = basis_data->d_interp1d;
1000       code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
1001       code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
1002       if (basis_data->d_collograd1d) {
1003         data->G.in[i] = basis_data->d_collograd1d;
1004         code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
1005         code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
1006       } else {
1007         data->G.in[i] = basis_data->d_grad1d;
1008         code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
1009         code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
1010       }
1011       break;
1012     case CEED_EVAL_WEIGHT:
1013       break; // No action
1014     case CEED_EVAL_DIV:
1015       break; // TODO: Not implemented
1016     case CEED_EVAL_CURL:
1017       break; // TODO: Not implemented
1018     }
1019   }
1020   for (CeedInt i = 0; i < numoutputfields; i++) {
1021     code << "// Output field "<<i<<"\n";
1022     // Get elemsize, emode, ncomp
1023     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1024     CeedChk(ierr);
1025     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1026     CeedChk(ierr);
1027     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1028     CeedChk(ierr);
1029     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1030     CeedChk(ierr);
1031     // Basis action
1032     switch (emode) {
1033     case CEED_EVAL_NONE:
1034       code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
1035       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1036       code << "  const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n";
1037       break; // No action
1038     case CEED_EVAL_INTERP:
1039       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
1040       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
1041       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
1042       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
1043       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1044       data->B.out[i] = basis_data->d_interp1d;
1045       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1046       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1047       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1048       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1049       break;
1050     case CEED_EVAL_GRAD:
1051       code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
1052       ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
1053       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
1054       code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
1055       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1056       data->B.out[i] = basis_data->d_interp1d;
1057       code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1058       code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1059       if (basis_data->d_collograd1d) {
1060         data->G.out[i] = basis_data->d_collograd1d;
1061         code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1062         code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1063       } else {
1064         data->G.out[i] = basis_data->d_grad1d;
1065         code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1066         code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1067       }
1068       ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr);
1069       code << "  const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n";
1070       break;
1071     case CEED_EVAL_WEIGHT: {
1072       Ceed ceed;
1073       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1074       return CeedError(ceed, 1,
1075                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1076       break; // Should not occur
1077     }
1078     case CEED_EVAL_DIV:
1079       break; // TODO: Not implemented
1080     case CEED_EVAL_CURL:
1081       break; // TODO: Not implemented
1082     }
1083   }
1084   code << "\n";
1085   code << "__syncthreads();\n";
1086   code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1087   // Input basis apply if needed
1088   // Generate the correct eval mode code for each input
1089   for (CeedInt i = 0; i < numinputfields; i++) {
1090     code << "  // Input field "<<i<<"\n";
1091     // Get elemsize, emode, ncomp
1092     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1093     CeedChk(ierr);
1094     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1095     CeedChk(ierr);
1096     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1097     CeedChk(ierr);
1098     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1099     CeedChk(ierr);
1100     // Basis action
1101     switch (emode) {
1102     case CEED_EVAL_NONE:
1103       if (!basis_data->d_collograd1d) {
1104         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1105         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1106         code << "  readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
1107       }
1108       break;
1109     case CEED_EVAL_INTERP:
1110       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1111       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1112       data->indices.in[i] = restr_data->d_ind;
1113       if (data->indicies.in[i]) {
1114         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1115       } else {
1116         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr);
1117       }
1118       code << "  readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1119       code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1120       code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1121       break;
1122     case CEED_EVAL_GRAD:
1123       code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1124       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1125       data->indices.in[i] = restr_data->d_ind;
1126       if (data->indicies.in[i]) {
1127         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1128       } else {
1129         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr);
1130       }
1131       code << "  readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1132       if (basis_data->d_collograd1d) {
1133         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1134         code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1135       } else {
1136         code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1137         code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1138       }
1139       break;
1140     case CEED_EVAL_WEIGHT:
1141       code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
1142       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1143       ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
1144       data->W = basis_data->d_qweight1d;
1145       code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1146       break; // No action
1147     case CEED_EVAL_DIV:
1148       break; // TODO: Not implemented
1149     case CEED_EVAL_CURL:
1150       break; // TODO: Not implemented
1151     }
1152   }
1153 
1154   // Q function
1155   for (CeedInt i = 0; i < numoutputfields; i++) {
1156       code << "  // Output field "<<i<<"\n";
1157     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1158     CeedChk(ierr);
1159     if (emode==CEED_EVAL_GRAD)
1160     {
1161       if (basis_data->d_collograd1d) {
1162         //Accumulator for gradient slices
1163         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1164         code << "  for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1165         code << "    for (CeedInt j = 0; j < Q1d; ++j) {\n";
1166         code << "      r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1167         code << "    }\n";
1168         code << "  }\n";
1169       } else {
1170         code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1171       }
1172     }
1173     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1174     {
1175       code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1176     }
1177   }
1178   //We treat quadrature points per slice in 3d to save registers
1179   code << "// QFunction\n";
1180   if (basis_data->d_collograd1d) {
1181     code << "#pragma unroll\n";
1182     code << "for (CeedInt q=0; q<Q1d; q++) {\n";
1183     for (CeedInt i = 0; i < numinputfields; i++) {
1184       code << "  // Input field "<<i<<"\n";
1185       // Get elemsize, emode, ncomp
1186       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1187       CeedChk(ierr);
1188       // Basis action
1189       switch (emode) {
1190       case CEED_EVAL_NONE:
1191         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1192         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1193         code << "  readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1194         break;
1195       case CEED_EVAL_INTERP:
1196         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1197         code << "  for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1198         code << "    r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1199         code << "  }\n";
1200         break;
1201       case CEED_EVAL_GRAD:
1202         code << "  CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1203         code << "  gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1204         break;
1205       case CEED_EVAL_WEIGHT:
1206         code << "  CeedScalar r_q"<<i<<"[1];\n";
1207         code << "  r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1208         break; // No action
1209       case CEED_EVAL_DIV:
1210         break; // TODO: Not implemented
1211       case CEED_EVAL_CURL:
1212         break; // TODO: Not implemented
1213       }
1214     }
1215     for (CeedInt i = 0; i < numoutputfields; i++) {
1216       code << "  // Output field "<<i<<"\n";
1217       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1218       CeedChk(ierr);
1219       // Basis action
1220       switch (emode) {
1221       case CEED_EVAL_NONE:
1222         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1223         break; // No action
1224       case CEED_EVAL_INTERP:
1225         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1226         break;
1227       case CEED_EVAL_GRAD:
1228         code << "  CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1229         break;
1230       case CEED_EVAL_WEIGHT:
1231         break; // Should not occur
1232       case CEED_EVAL_DIV:
1233         break; // TODO: Not implemented
1234       case CEED_EVAL_CURL:
1235         break; // TODO: Not implemented
1236       }
1237     }
1238   } else {
1239     for (CeedInt i = 0; i < numinputfields; i++) {
1240       code << "  CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1241     }
1242     for (CeedInt i = 0; i < numoutputfields; i++) {
1243       code << "  CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1244     }
1245   }
1246   code << "  CeedScalar* in["<<numinputfields<<"];\n";
1247   for (CeedInt i = 0; i < numinputfields; i++) {
1248     code << "  in["<<i<<"] = r_q"<<i<<";\n";
1249   }
1250   code << "  CeedScalar* out["<<numoutputfields<<"];\n";
1251   for (CeedInt i = 0; i < numoutputfields; i++) {
1252     code << "  out["<<i<<"] = r_qq"<<i<<";\n";
1253   }
1254   string qFunctionName(qf_data->qFunctionName);
1255   code << "  "<<qFunctionName<<"(ctx, ";
1256   if (dim != 3 || basis_data->d_collograd1d) {
1257     code << "1 ";
1258   }else{
1259     code << "Q1d ";
1260   }
1261   code << ", in, out);\n";
1262   if (basis_data->d_collograd1d) {
1263     for (CeedInt i = 0; i < numoutputfields; i++) {
1264       code << "  // Output field "<<i<<"\n";
1265       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1266       CeedChk(ierr);
1267       // Basis action
1268       switch (emode) {
1269       case CEED_EVAL_NONE:
1270         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1271         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1272         code << "  }\n";
1273         break; // No action
1274       case CEED_EVAL_INTERP:
1275         code << "  for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1276         code << "    r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1277         code << "  }\n";
1278         break;
1279       case CEED_EVAL_GRAD:
1280         code << "  gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1281         break;
1282       case CEED_EVAL_WEIGHT:
1283         break; // Should not occur
1284       case CEED_EVAL_DIV:
1285         break; // TODO: Not implemented
1286       case CEED_EVAL_CURL:
1287         break; // TODO: Not implemented
1288       }
1289     }
1290     code << "}\n";
1291   }
1292 
1293   // Output basis apply if needed
1294   // Generate the correct eval mode code for each output
1295   for (CeedInt i = 0; i < numoutputfields; i++) {
1296     code << "  // Output field "<<i<<"\n";
1297     // Get elemsize, emode, ncomp
1298     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1299     CeedChk(ierr);
1300     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1301     CeedChk(ierr);
1302     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1303     CeedChk(ierr);
1304     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1305     CeedChk(ierr);
1306     // Basis action
1307     switch (emode) {
1308     case CEED_EVAL_NONE:
1309       ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1310       code << "  writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
1311       break; // No action
1312     case CEED_EVAL_INTERP:
1313       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1314       code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1315       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1316       data->indices.out[i] = restr_data->d_ind;
1317       if (data->indicies.out[i]) {
1318         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1319       } else {
1320         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr);
1321       }
1322       code << "  writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1323       break;
1324     case CEED_EVAL_GRAD:
1325       code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1326       if (basis_data->d_collograd1d) {
1327         code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1328       } else {
1329         code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1330       }
1331       ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
1332       data->indices.out[i] = restr_data->d_ind;
1333       if (data->indicies.out[i]) {
1334         ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr);
1335       } else {
1336         ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr);
1337       }
1338       code << "  writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1339       break;
1340     case CEED_EVAL_WEIGHT: {
1341       Ceed ceed;
1342       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1343       return CeedError(ceed, 1,
1344                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1345       break; // Should not occur
1346     }
1347     case CEED_EVAL_DIV:
1348       break; // TODO: Not implemented
1349     case CEED_EVAL_CURL:
1350       break; // TODO: Not implemented
1351     }
1352   }
1353 
1354   code << "  }\n";
1355   code << "}\n\n";
1356 
1357   // std::cout << code.str();
1358 
1359   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
1360   ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
1361   CeedChk(ierr);
1362 
1363   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1364 
1365   return 0;
1366 }
1367