xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision d92fedf5b7546cf2fc50391dbcfb657a2e1f0a3b)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <cuda_runtime.h>
22 #include <iostream>
23 #include <sstream>
24 #include "ceed-cuda-gen.h"
25 #include "../cuda/ceed-cuda-compile.h"
26 #include "../cuda-ref/ceed-cuda-ref.h"
27 #include "../cuda-shared/ceed-cuda-shared.h"
28 
29 static const char *atomicAdd = QUOTE(
30 //------------------------------------------------------------------------------
31 // Atomic add, for older CUDA
32 //------------------------------------------------------------------------------
33 __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) {
34   unsigned long long int *address_as_ull = (unsigned long long int *)address;
35   unsigned long long int old = *address_as_ull, assumed;
36   do {
37     assumed = old;
38     old =
39       atomicCAS(address_as_ull, assumed,
40                 __double_as_longlong(val +
41                                     __longlong_as_double(assumed)));
42     // Note: uses integer comparison to avoid hang in case of NaN
43     // (since NaN != NaN)
44   } while (assumed != old);
45   return __longlong_as_double(old);
46 }
47 );
48 
49 static const char *deviceFunctions = QUOTE(
50 
51 //------------------------------------------------------------------------------
52 // Typedefs
53 //------------------------------------------------------------------------------
54 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
55 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
56 
57 typedef struct {
58   CeedInt tidx;
59   CeedInt tidy;
60   CeedInt tidz;
61   CeedInt tid;
62   CeedScalar* slice;
63 } BackendData;
64 
65 //------------------------------------------------------------------------------
66 // Load matrices for basis actions
67 //------------------------------------------------------------------------------
68 template <int P, int Q>
69 inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
70   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
71     B[i] = d_B[i];
72 }
73 
74 //------------------------------------------------------------------------------
75 // 1D
76 //------------------------------------------------------------------------------
77 
78 //------------------------------------------------------------------------------
79 // L-vector -> E-vector, offsets provided
80 //------------------------------------------------------------------------------
81 template <int NCOMP, int COMPSTRIDE, int P1d>
82 inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
83   if (data.tidx < P1d) {
84     const CeedInt node = data.tidx;
85     const CeedInt ind = indices[node + elem * P1d];
86     for (CeedInt comp = 0; comp < NCOMP; ++comp)
87       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // L-vector -> E-vector, strided
93 //------------------------------------------------------------------------------
94 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
95 inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
96   if (data.tidx < P1d) {
97     const CeedInt node = data.tidx;
98     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
99     for (CeedInt comp = 0; comp < NCOMP; ++comp)
100       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
101   }
102 }
103 
104 //------------------------------------------------------------------------------
105 // E-vector -> L-vector, offsets provided
106 //------------------------------------------------------------------------------
107 template <int NCOMP, int COMPSTRIDE, int P1d>
108 inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
109   if (data.tidx < P1d) {
110     const CeedInt node = data.tidx;
111     const CeedInt ind = indices[node + elem * P1d];
112     for (CeedInt comp = 0; comp < NCOMP; ++comp)
113       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
114   }
115 }
116 
117 //------------------------------------------------------------------------------
118 // E-vector -> L-vector, strided
119 //------------------------------------------------------------------------------
120 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
121 inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
122   if (data.tidx < P1d) {
123     const CeedInt node = data.tidx;
124     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
125     for (CeedInt comp = 0; comp < NCOMP; ++comp)
126       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
127   }
128 }
129 
130 //------------------------------------------------------------------------------
131 // 1D tensor contraction x
132 //------------------------------------------------------------------------------
133 template <int NCOMP, int P1d, int Q1d>
134 inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
135   data.slice[data.tidx] = *U;
136   __syncthreads();
137   *V = 0.0;
138   if (data.tidx < Q1d)
139     for (CeedInt i = 0; i < P1d; ++i)
140       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
141   __syncthreads();
142 }
143 
144 //------------------------------------------------------------------------------
145 // 1D transpose tensor contraction x
146 //------------------------------------------------------------------------------
147 template <int NCOMP, int P1d, int Q1d>
148 inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
149   data.slice[data.tidx] = *U;
150   __syncthreads();
151   *V = 0.0;
152   if (data.tidx < P1d)
153     for (CeedInt i = 0; i < Q1d; ++i)
154       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
155   __syncthreads();
156 }
157 
158 //------------------------------------------------------------------------------
159 // 1D interpolate to quadrature points
160 //------------------------------------------------------------------------------
161 template <int NCOMP, int P1d, int Q1d>
162 inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163   for (CeedInt comp = 0; comp < NCOMP; comp++)
164     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 1D interpolate transpose
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp=0; comp<NCOMP; comp++)
173     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D derivatives at quadrature points
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp = 0; comp < NCOMP; comp++)
182     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 1D derivatives transpose
187 //------------------------------------------------------------------------------
188 template <int NCOMP, int P1d, int Q1d>
189 inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
190   for (CeedInt comp = 0; comp < NCOMP; comp++)
191     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D
196 //------------------------------------------------------------------------------
197 
198 //------------------------------------------------------------------------------
199 // L-vector -> E-vector, offsets provided
200 //------------------------------------------------------------------------------
201 template <int NCOMP, int COMPSTRIDE, int P1d>
202 inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
203   if (data.tidx < P1d && data.tidy < P1d) {
204     const CeedInt node = data.tidx + data.tidy*P1d;
205     const CeedInt ind = indices[node + elem * P1d*P1d];
206     for (CeedInt comp = 0; comp < NCOMP; ++comp)
207       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
208   }
209 }
210 
211 //------------------------------------------------------------------------------
212 // L-vector -> E-vector, strided
213 //------------------------------------------------------------------------------
214 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
215 inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
216   if (data.tidx < P1d && data.tidy < P1d) {
217     const CeedInt node = data.tidx + data.tidy*P1d;
218     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
219     for (CeedInt comp = 0; comp < NCOMP; ++comp)
220       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
221   }
222 }
223 
224 //------------------------------------------------------------------------------
225 // E-vector -> L-vector, offsets provided
226 //------------------------------------------------------------------------------
227 template <int NCOMP, int COMPSTRIDE, int P1d>
228 inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
229   if (data.tidx < P1d && data.tidy < P1d) {
230     const CeedInt node = data.tidx + data.tidy*P1d;
231     const CeedInt ind = indices[node + elem * P1d*P1d];
232     for (CeedInt comp = 0; comp < NCOMP; ++comp)
233       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
234   }
235 }
236 
237 //------------------------------------------------------------------------------
238 // E-vector -> L-vector, strided
239 //------------------------------------------------------------------------------
240 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
241 inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
242   if (data.tidx < P1d && data.tidy < P1d) {
243     const CeedInt node = data.tidx + data.tidy*P1d;
244     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
245     for (CeedInt comp = 0; comp < NCOMP; ++comp)
246       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
247   }
248 }
249 
250 //------------------------------------------------------------------------------
251 // 2D tensor contraction x
252 //------------------------------------------------------------------------------
253 template <int NCOMP, int P1d, int Q1d>
254 inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
255   data.slice[data.tidx+data.tidy*T1d] = *U;
256   __syncthreads();
257   *V = 0.0;
258   if (data.tidx < Q1d && data.tidy < P1d)
259     for (CeedInt i = 0; i < P1d; ++i)
260       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
261   __syncthreads();
262 }
263 
264 //------------------------------------------------------------------------------
265 // 2D tensor contract y
266 //------------------------------------------------------------------------------
267 template <int NCOMP, int P1d, int Q1d>
268 inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
269   data.slice[data.tidx+data.tidy*T1d] = *U;
270   __syncthreads();
271   *V = 0.0;
272   if (data.tidx < Q1d && data.tidy < Q1d)
273     for (CeedInt i = 0; i < P1d; ++i)
274       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
275   __syncthreads();
276 }
277 
278 //------------------------------------------------------------------------------
279 // 2D transpose tensor contract y
280 //------------------------------------------------------------------------------
281 template <int NCOMP, int P1d, int Q1d>
282 inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
283   data.slice[data.tidx+data.tidy*T1d] = *U;
284   __syncthreads();
285   *V = 0.0;
286   if (data.tidx < Q1d && data.tidy < P1d)
287     for (CeedInt i = 0; i < Q1d; ++i)
288       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
289   __syncthreads();
290 }
291 
292 //------------------------------------------------------------------------------
293 // 2D transpose tensor contract x
294 //------------------------------------------------------------------------------
295 template <int NCOMP, int P1d, int Q1d>
296 inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
297   data.slice[data.tidx+data.tidy*T1d] = *U;
298   __syncthreads();
299   *V = 0.0;
300   if (data.tidx < P1d && data.tidy < P1d)
301     for (CeedInt i = 0; i < Q1d; ++i)
302       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
303   __syncthreads();
304 }
305 
306 //------------------------------------------------------------------------------
307 // 2D transpose tensor contract and add x
308 //------------------------------------------------------------------------------
309 template <int NCOMP, int P1d, int Q1d>
310 inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
311   data.slice[data.tidx+data.tidy*T1d] = *U;
312   __syncthreads();
313   if (data.tidx < P1d && data.tidy < P1d)
314     for (CeedInt i = 0; i < Q1d; ++i)
315       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
316   __syncthreads();
317 }
318 
319 //------------------------------------------------------------------------------
320 // 2D interpolate to quadrature points
321 //------------------------------------------------------------------------------
322 template <int NCOMP, int P1d, int Q1d>
323 inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
324   CeedScalar r_t[1];
325   for (CeedInt comp = 0; comp < NCOMP; comp++) {
326     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
327     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
328   }
329 }
330 
331 //------------------------------------------------------------------------------
332 // 2D interpolate transpose
333 //------------------------------------------------------------------------------
334 template <int NCOMP, int P1d, int Q1d>
335 inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
336   CeedScalar r_t[1];
337   for (CeedInt comp = 0; comp < NCOMP; comp++) {
338     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
339     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
340   }
341 }
342 
343 //------------------------------------------------------------------------------
344 // 2D derivatives at quadrature points
345 //------------------------------------------------------------------------------
346 template <int NCOMP, int P1d, int Q1d>
347 inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
348   CeedScalar r_t[1];
349   for (CeedInt comp = 0; comp < NCOMP; comp++) {
350     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
351     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
352     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
353     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
354   }
355 }
356 
357 //------------------------------------------------------------------------------
358 // 2D derivatives transpose
359 //------------------------------------------------------------------------------
360 template <int NCOMP, int P1d, int Q1d>
361 inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
362   CeedScalar r_t[1];
363   for (CeedInt comp = 0; comp < NCOMP; comp++) {
364     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
365     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
366     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
367     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D
373 //------------------------------------------------------------------------------
374 
375 //------------------------------------------------------------------------------
376 // L-vector -> E-vector, offsets provided
377 //------------------------------------------------------------------------------
378 template <int NCOMP, int COMPSTRIDE, int P1d>
379 inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
380   if (data.tidx < P1d && data.tidy < P1d)
381     for (CeedInt z = 0; z < P1d; ++z) {
382       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
383       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
384       for (CeedInt comp = 0; comp < NCOMP; ++comp)
385         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
386     }
387 }
388 
389 //------------------------------------------------------------------------------
390 // L-vector -> E-vector, strided
391 //------------------------------------------------------------------------------
392 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
393 inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
394   if (data.tidx < P1d && data.tidy < P1d)
395     for (CeedInt z = 0; z < P1d; ++z) {
396       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
397       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
398       for (CeedInt comp = 0; comp < NCOMP; ++comp)
399         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
400     }
401 }
402 
403 //------------------------------------------------------------------------------
404 // E-vector -> Q-vector, offests provided
405 //------------------------------------------------------------------------------
406 template <int NCOMP, int COMPSTRIDE, int Q1d>
407 inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
408   if (data.tidx < Q1d && data.tidy < Q1d) {
409     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
411     for (CeedInt comp = 0; comp < NCOMP; ++comp)
412       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> Q-vector, strided
418 //------------------------------------------------------------------------------
419 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
420 inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
421   if (data.tidx < Q1d && data.tidy < Q1d) {
422     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
423     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
424     for (CeedInt comp = 0; comp < NCOMP; ++comp)
425       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
426   }
427 }
428 
429 //------------------------------------------------------------------------------
430 // E-vector -> L-vector, offsets provided
431 //------------------------------------------------------------------------------
432 template <int NCOMP, int COMPSTRIDE, int P1d>
433 inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) {
434   if (data.tidx < P1d && data.tidy < P1d)
435     for (CeedInt z = 0; z < P1d; ++z) {
436       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
437       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
438       for (CeedInt comp = 0; comp < NCOMP; ++comp)
439         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
440     }
441 }
442 
443 //------------------------------------------------------------------------------
444 // E-vector -> L-vector, strided
445 //------------------------------------------------------------------------------
446 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
447 inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
448   if (data.tidx < P1d && data.tidy < P1d)
449     for (CeedInt z = 0; z < P1d; ++z) {
450       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
451       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
452       for (CeedInt comp = 0; comp < NCOMP; ++comp)
453         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
454     }
455 }
456 
457 //------------------------------------------------------------------------------
458 // 3D tensor contract x
459 //------------------------------------------------------------------------------
460 template <int NCOMP, int P1d, int Q1d>
461 inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
462   CeedScalar r_B[P1d];
463   for (CeedInt i = 0; i < P1d; ++i)
464     r_B[i] = B[i + data.tidx*P1d];
465 
466   for (CeedInt k = 0; k < P1d; ++k) {
467     data.slice[data.tidx+data.tidy*T1d] = U[k];
468     __syncthreads();
469     V[k] = 0.0;
470     if (data.tidx < Q1d && data.tidy < P1d)
471       for (CeedInt i = 0; i < P1d; ++i)
472         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
473     __syncthreads();
474   }
475 }
476 
477 //------------------------------------------------------------------------------
478 // 3D tensor contract y
479 //------------------------------------------------------------------------------
480 template <int NCOMP, int P1d, int Q1d>
481 inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
482   CeedScalar r_B[P1d];
483   for (CeedInt i = 0; i < P1d; ++i)
484     r_B[i] = B[i + data.tidy*P1d];
485 
486   for (CeedInt k = 0; k < P1d; ++k) {
487     data.slice[data.tidx+data.tidy*T1d] = U[k];
488     __syncthreads();
489     V[k] = 0.0;
490     if (data.tidx < Q1d && data.tidy < Q1d)
491       for (CeedInt i = 0; i < P1d; ++i)
492         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
493     __syncthreads();
494   }
495 }
496 
497 //------------------------------------------------------------------------------
498 // 3D tensor contract z
499 //------------------------------------------------------------------------------
500 template <int NCOMP, int P1d, int Q1d>
501 inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
502   for (CeedInt k = 0; k < Q1d; ++k) {
503     V[k] = 0.0;
504     if (data.tidx < Q1d && data.tidy < Q1d)
505       for (CeedInt i = 0; i < P1d; ++i)
506         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
507   }
508 }
509 
510 //------------------------------------------------------------------------------
511 // 3D transpose tensor contract z
512 //------------------------------------------------------------------------------
513 template <int NCOMP, int P1d, int Q1d>
514 inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
515   for (CeedInt k = 0; k < P1d; ++k) {
516     V[k] = 0.0;
517     if (data.tidx < Q1d && data.tidy < Q1d)
518       for (CeedInt i = 0; i < Q1d; ++i)
519         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
520   }
521 }
522 
523 //------------------------------------------------------------------------------
524 // 3D transpose tensor contract y
525 //------------------------------------------------------------------------------
526 template <int NCOMP, int P1d, int Q1d>
527 inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
528   CeedScalar r_B[Q1d];
529   for (CeedInt i = 0; i < Q1d; ++i)
530     r_B[i] = B[data.tidy + i*P1d];
531 
532   for (CeedInt k = 0; k < P1d; ++k) {
533     data.slice[data.tidx+data.tidy*T1d] = U[k];
534     __syncthreads();
535     V[k] = 0.0;
536     if (data.tidx < Q1d && data.tidy < P1d)
537       for (CeedInt i = 0; i < Q1d; ++i)
538         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
539     __syncthreads();
540   }
541 }
542 
543 //------------------------------------------------------------------------------
544 // 3D transpose tensor contract add y
545 //------------------------------------------------------------------------------
546 template <int NCOMP, int P1d, int Q1d>
547 inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
548   CeedScalar r_B[Q1d];
549   for (CeedInt i = 0; i < Q1d; ++i)
550     r_B[i] = B[data.tidy + i*P1d];
551 
552   for (CeedInt k = 0; k < P1d; ++k) {
553     data.slice[data.tidx+data.tidy*T1d] = U[k];
554     __syncthreads();
555     if (data.tidx < Q1d && data.tidy < P1d)
556       for (CeedInt i = 0; i < Q1d; ++i)
557         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
558     __syncthreads();
559   }
560 }
561 
562 //------------------------------------------------------------------------------
563 // 3D transpose tensor contract x
564 //------------------------------------------------------------------------------
565 template <int NCOMP, int P1d, int Q1d>
566 inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
567   CeedScalar r_B[Q1d];
568   for (CeedInt i = 0; i < Q1d; ++i)
569     r_B[i] = B[data.tidx + i*P1d];
570 
571   for (CeedInt k = 0; k < P1d; ++k) {
572     data.slice[data.tidx+data.tidy*T1d] = U[k];
573     __syncthreads();
574     V[k] = 0.0;
575     if (data.tidx < P1d && data.tidy < P1d)
576       for (CeedInt i = 0; i < Q1d; ++i)
577         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
578     __syncthreads();
579   }
580 }
581 
582 //------------------------------------------------------------------------------
583 // 3D transpose tensor contract add x
584 //------------------------------------------------------------------------------
585 template <int NCOMP, int P1d, int Q1d>
586 inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
587   CeedScalar r_B[Q1d];
588   for (CeedInt i = 0; i < Q1d; ++i)
589     r_B[i] = B[data.tidx + i*P1d];
590 
591   for (CeedInt k = 0; k < P1d; ++k) {
592     data.slice[data.tidx+data.tidy*T1d] = U[k];
593     __syncthreads();
594     if (data.tidx < P1d && data.tidy < P1d)
595       for (CeedInt i = 0; i < Q1d; ++i)
596         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
597     __syncthreads();
598   }
599 }
600 
601 //------------------------------------------------------------------------------
602 // 3D interpolate to quadrature points
603 //------------------------------------------------------------------------------
604 template <int NCOMP, int P1d, int Q1d>
605 inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
606   CeedScalar r_t1[T1d];
607   CeedScalar r_t2[T1d];
608   for (CeedInt comp = 0; comp < NCOMP; comp++) {
609     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
610     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
611     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
612   }
613 }
614 
615 //------------------------------------------------------------------------------
616 // 3D interpolate transpose
617 //------------------------------------------------------------------------------
618 template <int NCOMP, int P1d, int Q1d>
619 inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
620   CeedScalar r_t1[T1d];
621   CeedScalar r_t2[T1d];
622   for (CeedInt comp = 0; comp < NCOMP; comp++) {
623     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
624     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
625     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
626   }
627 }
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives at quadrature points
631 //------------------------------------------------------------------------------
632 template <int NCOMP, int P1d, int Q1d>
633 inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
634   CeedScalar r_t1[T1d];
635   CeedScalar r_t2[T1d];
636   for (CeedInt comp = 0; comp < NCOMP; comp++) {
637     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
638     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
639     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
640     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
641     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
642     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
643     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
644     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
645     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
646   }
647 }
648 
649 //------------------------------------------------------------------------------
650 // 3D derivatives transpose
651 //------------------------------------------------------------------------------
652 template <int NCOMP, int P1d, int Q1d>
653 inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
654   CeedScalar r_t1[T1d];
655   CeedScalar r_t2[T1d];
656   for (CeedInt comp = 0; comp < NCOMP; comp++) {
657     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
658     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
659     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
660     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
661     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
662     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
663     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
664     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
665     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
666   }
667 }
668 
669 //------------------------------------------------------------------------------
670 // 3D collocated derivatives computation
671 //------------------------------------------------------------------------------
672 template <int NCOMP, int Q1d>
673 inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
674   if (data.tidx < Q1d && data.tidy < Q1d) {
675     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
676       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
677       __syncthreads();
678       // X derivative
679       r_V[comp+0*NCOMP] = 0.0;
680       for (CeedInt i = 0; i < Q1d; ++i)
681         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
682       // Y derivative
683       r_V[comp+1*NCOMP] = 0.0;
684       for (CeedInt i = 0; i < Q1d; ++i)
685         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
686       // Z derivative
687       r_V[comp+2*NCOMP] = 0.0;
688       for (CeedInt i = 0; i < Q1d; ++i)
689         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
690       __syncthreads();
691     }
692   }
693 }
694 
695 //------------------------------------------------------------------------------
696 // 3D collocated derivatives transpose
697 //------------------------------------------------------------------------------
698 template <int NCOMP, int Q1d>
699 inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
700   if (data.tidx < Q1d && data.tidy < Q1d) {
701     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
702       // X derivative
703       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
704       __syncthreads();
705       for (CeedInt i = 0; i < Q1d; ++i)
706         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
707       __syncthreads();
708       // Y derivative
709       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
710       __syncthreads();
711       for (CeedInt i = 0; i < Q1d; ++i)
712         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
713       __syncthreads();
714       // Z derivative
715       for (CeedInt i = 0; i < Q1d; ++i)
716         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
717     }
718   }
719 }
720 
721 //------------------------------------------------------------------------------
722 // 1D quadrature weights
723 //------------------------------------------------------------------------------
724 template <int Q1d>
725 inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
726   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
727 }
728 
729 //------------------------------------------------------------------------------
730 // 2D quadrature weights
731 //------------------------------------------------------------------------------
732 template <int Q1d>
733 inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
734   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
735         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736 }
737 
738 //------------------------------------------------------------------------------
739 // 3D quadrature weights
740 //------------------------------------------------------------------------------
741 template <int Q1d>
742 inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) {
743   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
744   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
745   for (CeedInt z = 0; z < Q1d; ++z)
746     w[z] = quad ? pw*qweight1d[z] : 0.0;
747 }
748 
749 );
750 //------------------------------------------------------------------------------
751 // Build singe operator kernel
752 //------------------------------------------------------------------------------
753 extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
754 
755   using std::ostringstream;
756   using std::string;
757   int ierr;
758   bool setupdone;
759   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760   if (setupdone) return CEED_ERROR_SUCCESS;
761   Ceed ceed;
762   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763   CeedOperator_Cuda_gen *data;
764   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765   CeedQFunction qf;
766   CeedQFunction_Cuda_gen *qf_data;
767   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
770           numoutputfields, ncomp, dim = 0, lsize;
771   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
772   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
773   CeedOperatorField *opinputfields, *opoutputfields;
774   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
775   CeedChkBackend(ierr);
776   CeedQFunctionField *qfinputfields, *qfoutputfields;
777   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
778   CeedChkBackend(ierr);
779   CeedEvalMode emode;
780   CeedBasis basis;
781   CeedBasis_Cuda_shared *basis_data;
782   CeedElemRestriction Erestrict;
783   CeedElemRestriction_Cuda *restr_data;
784 
785   // Check for restriction only identity operator
786   bool is_identity_qf;
787   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
788   if (is_identity_qf) {
789     CeedEvalMode emodein, emodeout;
790     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
791     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
792     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
793       // LCOV_EXCL_START
794       return CeedError(ceed, CEED_ERROR_BACKEND,
795                        "Backend does not implement restriction only identity operators");
796     // LCOV_EXCL_STOP
797   }
798 
799   ostringstream code;
800   string devFunctions(deviceFunctions);
801 
802   // Add atomicAdd function for old NVidia architectures
803   struct cudaDeviceProp prop;
804   Ceed_Cuda *ceed_data;
805   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr);
806   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr);
807   if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){
808     code << atomicAdd;
809   }
810 
811   code << devFunctions;
812 
813   string qFunction(qf_data->qFunctionSource);
814   string qFunctionName(qf_data->qFunctionName);
815   string oper;
816   oper = "CeedKernel_Cuda_gen_" + qFunctionName;
817 
818   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
819   code << "#define CEED_QFUNCTION_HELPER inline __device__\n";
820   code << "#define CeedPragmaSIMD\n";
821   code << "#define CEED_ERROR_SUCCESS 0\n\n";
822 
823   // Find dim and Q1d
824   bool useCollograd = true;
825   data->maxP1d = 0;
826   for (CeedInt i = 0; i < numinputfields; i++) {
827     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
828     if (basis != CEED_BASIS_COLLOCATED) {
829       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
830       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
831       CeedChkBackend(ierr);
832 
833       // Check for collocated gradient
834       useCollograd = useCollograd && basis_data->d_collograd1d;
835 
836       // Collect dim and Q1d
837       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
838       bool isTensor;
839       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
840       if (isTensor) {
841         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
842         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
843         if (P1d>data->maxP1d) data->maxP1d = P1d;
844       } else {
845         // LCOV_EXCL_START
846         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
847         // LCOV_EXCL_STOP
848         }
849     }
850   }
851   // Check output bases for Q1d, dim as well
852   //   The only imput basis might be CEED_BASIS_COLLOCATED
853   for (CeedInt i = 0; i < numoutputfields; i++) {
854     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
855 
856     if (basis != CEED_BASIS_COLLOCATED) {
857       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
858       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
859       CeedChkBackend(ierr);
860 
861       // Collect dim and Q1d
862       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
863       bool isTensor;
864       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
865       if (isTensor) {
866         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
867       } else {
868         // LCOV_EXCL_START
869         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
870         // LCOV_EXCL_STOP
871         }
872 
873       // Check for collocated gradient
874       useCollograd = useCollograd && basis_data->d_collograd1d;
875     }
876   }
877   data->dim = dim;
878   data->Q1d = Q1d;
879 
880   // Define CEED_Q_VLA
881   if (dim != 3 || useCollograd) {
882     code << "\n#define CEED_Q_VLA 1\n\n";
883   } else {
884     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
885   }
886 
887   code << qFunction;
888 
889   // Setup
890   code << "\n// -----------------------------------------------------------------------------\n";
891   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
892   for (CeedInt i = 0; i < numinputfields; i++) {
893     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
894     CeedChkBackend(ierr);
895     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
896       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
897     }
898   }
899 
900   for (CeedInt i = 0; i < numoutputfields; i++) {
901     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
902   }
903 
904   code << "  const CeedInt Dim = "<<dim<<";\n";
905   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
906 
907   code << "  extern __shared__ CeedScalar slice[];\n";
908   code << "  BackendData data;\n";
909   code << "  data.tidx = threadIdx.x;\n";
910   code << "  data.tidy = threadIdx.y;\n";
911   code << "  data.tidz = threadIdx.z;\n";
912   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
913   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
914 
915   code << "\n  // -- Input field constants and basis data --\n";
916   //Initialize constants, and matrices B and G
917   for (CeedInt i = 0; i < numinputfields; i++) {
918     code << "  // ---- Input field "<<i<<" ----\n";
919     // Get elemsize, emode, ncomp
920     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
921     CeedChkBackend(ierr);
922     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
923     CeedChkBackend(ierr);
924     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
925     CeedChkBackend(ierr);
926     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
927     CeedChkBackend(ierr);
928 
929     // Set field constants
930     if (emode != CEED_EVAL_WEIGHT) {
931       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
932       if (basis != CEED_BASIS_COLLOCATED) {
933         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
934         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
935       } else {
936         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
937       }
938       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
939     }
940 
941     // Load basis data
942     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
943     switch (emode) {
944     case CEED_EVAL_NONE:
945       break;
946     case CEED_EVAL_INTERP:
947       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
948       data->B.in[i] = basis_data->d_interp1d;
949       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
950       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
951       break;
952     case CEED_EVAL_GRAD:
953       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
954       data->B.in[i] = basis_data->d_interp1d;
955       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
956       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
957       if (useCollograd) {
958         data->G.in[i] = basis_data->d_collograd1d;
959         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
960         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
961       } else {
962         data->G.in[i] = basis_data->d_grad1d;
963         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
964         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
965       }
966       break;
967     case CEED_EVAL_WEIGHT:
968       break; // No action
969     case CEED_EVAL_DIV:
970       break; // TODO: Not implemented
971     case CEED_EVAL_CURL:
972       break; // TODO: Not implemented
973     }
974   }
975 
976   code << "\n  // -- Output field constants and basis data --\n";
977   for (CeedInt i = 0; i < numoutputfields; i++) {
978     code << "  // ---- Output field "<<i<<" ----\n";
979     // Get elemsize, emode, ncomp
980     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
981     CeedChkBackend(ierr);
982     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
983     CeedChkBackend(ierr);
984     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
985     CeedChkBackend(ierr);
986     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
987     CeedChkBackend(ierr);
988 
989     // Set field constants
990     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
991     if (basis != CEED_BASIS_COLLOCATED) {
992       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
993       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
994     } else {
995       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
996     }
997     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
998 
999     // Load basis data
1000     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1001     switch (emode) {
1002     case CEED_EVAL_NONE:
1003       break; // No action
1004     case CEED_EVAL_INTERP:
1005       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1006       data->B.out[i] = basis_data->d_interp1d;
1007       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1008       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1009       break;
1010     case CEED_EVAL_GRAD:
1011       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1012       data->B.out[i] = basis_data->d_interp1d;
1013       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1014       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1015       if (useCollograd) {
1016         data->G.out[i] = basis_data->d_collograd1d;
1017         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1018         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1019       } else {
1020         data->G.out[i] = basis_data->d_grad1d;
1021         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1022         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1023       }
1024       break;
1025     // LCOV_EXCL_START
1026     case CEED_EVAL_WEIGHT: {
1027       Ceed ceed;
1028       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1029       return CeedError(ceed, CEED_ERROR_BACKEND,
1030                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1031       break; // Should not occur
1032     }
1033     case CEED_EVAL_DIV:
1034       break; // TODO: Not implemented
1035     case CEED_EVAL_CURL:
1036       break; // TODO: Not implemented
1037       // LCOV_EXCL_STOP
1038     }
1039   }
1040   code << "\n  // -- Element loop --\n";
1041   code << "  __syncthreads();\n";
1042   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1043   // Input basis apply if needed
1044   // Generate the correct eval mode code for each input
1045   code << "    // -- Input field restrictions and basis actions --\n";
1046   for (CeedInt i = 0; i < numinputfields; i++) {
1047     code << "    // ---- Input field "<<i<<" ----\n";
1048     // Get elemsize, emode, ncomp
1049     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1050     CeedChkBackend(ierr);
1051     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1052     CeedChkBackend(ierr);
1053     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1054     CeedChkBackend(ierr);
1055     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1056     CeedChkBackend(ierr);
1057 
1058     // Restriction
1059     if (emode != CEED_EVAL_WEIGHT &&
1060         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1061       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1062 
1063       bool isStrided;
1064       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1065       if (!isStrided) {
1066         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1067         CeedChkBackend(ierr);
1068         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1069         CeedInt compstride;
1070         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1071         code << "    // CompStride: "<<compstride<<"\n";
1072         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1073         data->indices.in[i] = restr_data->d_ind;
1074         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1075       } else {
1076         bool backendstrides;
1077         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1078         CeedChkBackend(ierr);
1079         CeedInt nelem;
1080         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1081         CeedChkBackend(ierr);
1082         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1083         if (!backendstrides) {
1084           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1085           CeedChkBackend(ierr);
1086         }
1087         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1088         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1089       }
1090     }
1091 
1092     // Basis action
1093     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1094     switch (emode) {
1095     case CEED_EVAL_NONE:
1096       if (!useCollograd) {
1097         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1098       }
1099       break;
1100     case CEED_EVAL_INTERP:
1101       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1102       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1103       break;
1104     case CEED_EVAL_GRAD:
1105       if (useCollograd) {
1106         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1107         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1108       } else {
1109         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1110         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1111       }
1112       break;
1113     case CEED_EVAL_WEIGHT:
1114       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1115       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1116       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1117       data->W = basis_data->d_qweight1d;
1118       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1119       break; // No action
1120     case CEED_EVAL_DIV:
1121       break; // TODO: Not implemented
1122     case CEED_EVAL_CURL:
1123       break; // TODO: Not implemented
1124     }
1125   }
1126 
1127   // Q function
1128   code << "\n    // -- Output field setup --\n";
1129   for (CeedInt i = 0; i < numoutputfields; i++) {
1130       code << "\n    // ---- Output field "<<i<<" ----\n";
1131     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1132     CeedChkBackend(ierr);
1133     if (emode==CEED_EVAL_GRAD)
1134     {
1135       if (useCollograd) {
1136         //Accumulator for gradient slices
1137         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1138         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1139         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1140         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1141         code << "      }\n";
1142         code << "    }\n";
1143       } else {
1144         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1145       }
1146     }
1147     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1148     {
1149       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1150     }
1151   }
1152   // We treat quadrature points per slice in 3d to save registers
1153   if (useCollograd) {
1154     code << "\n    // Note: Collocated Gradient\n";
1155     code << "#pragma unroll\n";
1156     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1157     code << "      // -- Input fields --\n";
1158     for (CeedInt i = 0; i < numinputfields; i++) {
1159       code << "      // ---- Input field "<<i<<" ----\n";
1160       // Get elemsize, emode, ncomp
1161       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1162       CeedChkBackend(ierr);
1163       // Basis action
1164       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1165       switch (emode) {
1166       case CEED_EVAL_NONE:
1167         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1168 
1169         bool isStrided;
1170         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1171         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1172         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1173         if (!isStrided) {
1174           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1175           CeedChkBackend(ierr);
1176           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1177           CeedInt compstride;
1178           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1179           code << "      // CompStride: "<<compstride<<"\n";
1180           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1181           data->indices.in[i] = restr_data->d_ind;
1182           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1183         } else {
1184           bool backendstrides;
1185           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1186           CeedChkBackend(ierr);
1187           CeedInt nelem;
1188           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1189           CeedChkBackend(ierr);
1190           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1191           if (!backendstrides) {
1192             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1193             CeedChkBackend(ierr);
1194           }
1195           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1196           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1197         }
1198         break;
1199       case CEED_EVAL_INTERP:
1200         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1201         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1202         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1203         code << "      }\n";
1204         break;
1205       case CEED_EVAL_GRAD:
1206         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1207         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1208         break;
1209       case CEED_EVAL_WEIGHT:
1210         code << "      CeedScalar r_q"<<i<<"[1];\n";
1211         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1212         break; // No action
1213       case CEED_EVAL_DIV:
1214         break; // TODO: Not implemented
1215       case CEED_EVAL_CURL:
1216         break; // TODO: Not implemented
1217       }
1218     }
1219     code << "\n      // -- Output fields --\n";
1220     for (CeedInt i = 0; i < numoutputfields; i++) {
1221       code << "      // ---- Output field "<<i<<" ----\n";
1222       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1223       CeedChkBackend(ierr);
1224       // Basis action
1225       switch (emode) {
1226       case CEED_EVAL_NONE:
1227         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1228         break; // No action
1229       case CEED_EVAL_INTERP:
1230         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1231         break;
1232       case CEED_EVAL_GRAD:
1233         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1234         break;
1235       case CEED_EVAL_WEIGHT:
1236         break; // Should not occur
1237       case CEED_EVAL_DIV:
1238         break; // TODO: Not implemented
1239       case CEED_EVAL_CURL:
1240         break; // TODO: Not implemented
1241       }
1242     }
1243   } else {
1244     code << "\n      // Note: No Collocated Gradient\n";
1245     code << "      // -- Input fields --\n";
1246     for (CeedInt i = 0; i < numinputfields; i++) {
1247       code << "      // ---- Input field "<<i<<" ----\n";
1248       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1249     }
1250     code << "      // -- Output fields --\n";
1251     for (CeedInt i = 0; i < numoutputfields; i++) {
1252       code << "      // ---- Output field "<<i<<" ----\n";
1253       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1254     }
1255   }
1256   code << "\n      // -- QFunction Inputs and outputs --\n";
1257   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1258   for (CeedInt i = 0; i < numinputfields; i++) {
1259     code << "      // ---- Input field "<<i<<" ----\n";
1260     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1261   }
1262   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1263   for (CeedInt i = 0; i < numoutputfields; i++) {
1264     code << "      // ---- Output field "<<i<<" ----\n";
1265     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1266   }
1267   code << "\n      // -- Apply QFunction --\n";
1268   code << "      "<<qFunctionName<<"(ctx, ";
1269   if (dim != 3 || useCollograd) {
1270     code << "1";
1271   } else {
1272     code << "Q1d";
1273   }
1274   code << ", in, out);\n";
1275   if (useCollograd) {
1276     code << "\n      // Note: Collocated Gradient\n";
1277     code << "      // -- Output fields --\n";
1278     for (CeedInt i = 0; i < numoutputfields; i++) {
1279       code << "      // ---- Output field "<<i<<" ----\n";
1280       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1281       CeedChkBackend(ierr);
1282       // Basis action
1283       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1284       switch (emode) {
1285       case CEED_EVAL_NONE:
1286         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1287         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1288         code << "      }\n";
1289         break; // No action
1290       case CEED_EVAL_INTERP:
1291         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1292         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1293         code << "      }\n";
1294         break;
1295       case CEED_EVAL_GRAD:
1296         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1297         break;
1298       case CEED_EVAL_WEIGHT:
1299         break; // Should not occur
1300       case CEED_EVAL_DIV:
1301         break; // TODO: Not implemented
1302       case CEED_EVAL_CURL:
1303         break; // TODO: Not implemented
1304       }
1305     }
1306     code << "    }\n";
1307   }
1308 
1309   // Output basis apply if needed
1310   // Generate the correct eval mode code for each output
1311   code << "\n    // -- Output field basis action and restrictions --\n";
1312   for (CeedInt i = 0; i < numoutputfields; i++) {
1313     code << "    // ---- Output field "<<i<<" ----\n";
1314     // Get elemsize, emode, ncomp
1315     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1316     CeedChkBackend(ierr);
1317     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1318     CeedChkBackend(ierr);
1319     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1320     CeedChkBackend(ierr);
1321     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1322     CeedChkBackend(ierr);
1323     // Basis action
1324     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1325     switch (emode) {
1326     case CEED_EVAL_NONE:
1327       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1328       break; // No action
1329     case CEED_EVAL_INTERP:
1330       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1331       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1332       break;
1333     case CEED_EVAL_GRAD:
1334       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1335       if (useCollograd) {
1336         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1337       } else {
1338         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1339       }
1340       break;
1341     // LCOV_EXCL_START
1342     case CEED_EVAL_WEIGHT: {
1343       Ceed ceed;
1344       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1345       return CeedError(ceed, CEED_ERROR_BACKEND,
1346                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1347       break; // Should not occur
1348     }
1349     case CEED_EVAL_DIV:
1350       break; // TODO: Not implemented
1351     case CEED_EVAL_CURL:
1352       break; // TODO: Not implemented
1353       // LCOV_EXCL_STOP
1354     }
1355     // Restriction
1356       bool isStrided;
1357       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1358     if (!isStrided) {
1359       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1360       CeedChkBackend(ierr);
1361       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1362       CeedInt compstride;
1363       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1364       code << "    // CompStride: "<<compstride<<"\n";
1365       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1366       data->indices.out[i] = restr_data->d_ind;
1367       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1368     } else {
1369       bool backendstrides;
1370       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1371       CeedChkBackend(ierr);
1372       CeedInt nelem;
1373       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1374       CeedChkBackend(ierr);
1375       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1376       if (!backendstrides) {
1377         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1378         CeedChkBackend(ierr);
1379       }
1380       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1381       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1382     }
1383   }
1384 
1385   code << "  }\n";
1386   code << "}\n";
1387   code << "// -----------------------------------------------------------------------------\n\n";
1388 
1389   // View kernel for debugging
1390   CeedDebug(ceed, code.str().c_str());
1391 
1392   ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1,
1393                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1394   CeedChkBackend(ierr);
1395   ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op);
1396   CeedChkBackend(ierr);
1397 
1398   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1399   return CEED_ERROR_SUCCESS;
1400 }
1401 //------------------------------------------------------------------------------
1402