xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision e7ea6884cf8b840b229fd0457276f9496ddff214)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-hip-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../hip-shared/ceed-hip-shared.h"
22 #include "../hip/ceed-hip-compile.h"
23 
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Double-precision atomic add implemented with a compare-and-swap loop.
// Fallback for architectures without a native double atomicAdd.
// NOTE(review): header comment said "for older CUDA" but this is the HIP
// backend -- presumably inherited from the CUDA backend; confirm intent.
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  // Reinterpret the target as a 64-bit integer so atomicCAS can be used
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Retry until no other thread has modified *address between read and CAS
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  // Return the previous value, matching the builtin atomicAdd contract
  return __longlong_as_double(old);
}
);
43 
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Scalar data pointers for up to 16 input and 16 output operator fields
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
// Integer (index) pointers for up to 16 input and 16 output operator fields
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread state passed to every device helper below
typedef struct {
  CeedInt tidx;       // thread index, x dimension
  CeedInt tidy;       // thread index, y dimension
  CeedInt tidz;       // thread index, z dimension
  CeedInt tid;        // flattened thread index within the block (strided by
                      // blockDim.x*blockDim.y*blockDim.z in loadMatrix)
  CeedScalar* slice;  // scratch buffer used by the tensor contractions;
                      // NOTE(review): set up by the generated kernel --
                      // presumably shared memory, since the contractions
                      // rely on __syncthreads() for cross-thread visibility
} BackendData;
59 
//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Copy a P x Q basis matrix from global memory (d_B) into B, with the copy
// strided across every thread in the block.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt block_size = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt idx = data.tid; idx < P*Q; idx += block_size)
    B[idx] = d_B[idx];
}
68 
//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers, one node per thread in x,
// indirecting through the restriction index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const bool active = data.tidx < P1d;
  if (active) {
    const CeedInt dof = indices[data.tidx + elem * P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof + COMPSTRIDE * c];
  }
}
85 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers using compile-time strides
// instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
98 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add register values back to the L-vector; atomicAdd is required
// because neighboring elements may contribute to the same dof.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  const bool active = data.tidx < P1d;
  if (active) {
    const CeedInt dof = indices[data.tidx + elem * P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
  }
}
111 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate register values back to the L-vector using compile-time strides;
// plain += matches the original (no index indirection here).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt base = data.tidx * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}
124 
//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V = B * U along x: stage each thread's U value in data.slice, sync so all
// staged values are visible, then threads with tidx < Q1d accumulate their row.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // slice may be reused by the next contraction
}
138 
//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// V = B^T * U along x: same staging as ContractX1d, but B is indexed
// transposed (tidx selects the column) and only threads with tidx < P1d write.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // slice may be reused by the next contraction
}
152 
//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B to each component in turn.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + c, c_B, r_V + c);
}
161 
//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Apply the transpose of the interpolation matrix c_B to each component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + c, c_B, r_V + c);
}
170 
//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Apply the gradient matrix c_G to each component (c_B unused in 1D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + c, c_G, r_V + c);
}
179 
//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Apply the transpose of the gradient matrix c_G to each component
// (c_B unused in 1D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + c, c_G, r_V + c);
}
188 
//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers, one node per (tidx, tidy)
// thread, indirecting through the restriction index array.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[dof + COMPSTRIDE * c];
  }
}
205 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers using compile-time strides
// instead of an index array.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
218 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add register values back to the L-vector; atomicAdd is required
// because neighboring elements may contribute to the same dof.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    const CeedInt dof = indices[data.tidx + data.tidy*P1d + elem * P1d*P1d];
    for (CeedInt c = 0; c < NCOMP; c++)
      atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
  }
}
231 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate register values back to the L-vector using compile-time strides;
// plain += matches the original (no index indirection here).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt n = data.tidx + data.tidy*P1d;
    const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}
244 
//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract B against U along x: stage U in data.slice (rows strided by T1d),
// sync, then threads with tidx < Q1d && tidy < P1d accumulate their entry.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // slice may be reused by the next contraction
}
258 
//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract B against U along y; input is already Q1d wide in x, so the guard
// is tidx < Q1d && tidy < Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // slice may be reused by the next contraction
}
272 
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract B^T against U along y (B indexed transposed: tidy selects the
// column); output is P1d wide in y, hence the tidy < P1d guard.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // slice may be reused by the next contraction
}
286 
//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract B^T against U along x; output is P1d x P1d, hence both guards.
// Overwrites V (see ContractXTransposeAdd2d for the accumulating variant).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged values must be visible before reading
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // slice may be reused by the next contraction
}
300 
//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V instead of overwriting
// it (no *V = 0.0) -- the caller must have initialized V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // staged values must be visible before reading
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // slice may be reused by the next contraction
}
313 
//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate each component: contract c_B along x into a scratch register,
// then along y into the output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + c, c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, r_V + c);
  }
}
325 
//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract c_B^T along y, then along x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + c, c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_B, r_V + c);
  }
}
337 
//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// For each component produce two outputs: d/dx (G along x, B along y) into the
// first NCOMP slots of r_V, and d/dy (B along x, G along y) into the second.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    // d/dx
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + c, c_G, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_B, r_V + c + 0*NCOMP);
    // d/dy
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + c, c_B, scratch);
    ContractY2d<NCOMP, P1d, Q1d>(data, scratch, c_G, r_V + c + 1*NCOMP);
  }
}
351 
//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: the d/dx part overwrites r_V, then the d/dy part is
// accumulated on top via the Add variant of the x contraction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    // d/dx^T (overwrites r_V)
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + c + 0*NCOMP, c_B, scratch);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, scratch, c_G, r_V + c);
    // d/dy^T (accumulates into r_V)
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + c + 1*NCOMP, c_G, scratch);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, scratch, c_B, r_V + c);
  }
}
365 
//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers: each (tidx, tidy) thread
// walks the z-line of nodes, indirecting through the restriction indices.
// Register layout: r_u[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt dof = indices[n + elem * P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[dof + COMPSTRIDE * c];
    }
  }
}
383 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather this element's dof values into registers using compile-time strides;
// each (tidx, tidy) thread walks its z-line. Register layout: r_u[z + comp*P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; c++)
        r_u[z + c*P1d] = d_u[base + c * STRIDES_COMP];
    }
  }
}
397 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one x-y slice (z-index q) of quadrature-point values for each
// component, indirecting through the offset indices for this element.
// Fixes: stray double semicolon removed; "offests" banner typo corrected.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
410 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one x-y slice (z-index q) of quadrature-point values for each
// component using compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt quad = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt base = quad * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt c = 0; c < NCOMP; c++)
      r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}
423 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add register values back to the L-vector, one z-line per (tidx,
// tidy) thread; atomicAdd because elements may share dofs.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt dof = indices[n + elem * P1d*P1d*P1d];
      for (CeedInt c = 0; c < NCOMP; c++)
        atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[z + c*P1d]);
    }
  }
}
437 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate register values back to the L-vector using compile-time strides,
// one z-line per (tidx, tidy) thread; plain += matches the original.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const bool active = data.tidx < P1d && data.tidy < P1d;
  if (active) {
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt n = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt base = n * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt c = 0; c < NCOMP; c++)
        d_v[base + c * STRIDES_COMP] += r_v[z + c*P1d];
    }
  }
}
451 
//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract B against U along x for each z-plane: this thread's row of B is
// first cached in registers (r_B), then each of the P1d planes is staged in
// data.slice and reduced. Both __syncthreads() are reached by all threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d]; // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
471 
//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract B against U along y for each z-plane; input is already Q1d wide in
// x, hence the tidx < Q1d && tidy < Q1d guard.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d]; // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
491 
//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract B against U along z. The z data lives entirely in this thread's
// registers, so no staging or __syncthreads() is needed. Out-of-range threads
// still zero V[k].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
504 
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Contract B^T against U along z (B indexed transposed). Register-only, so no
// __syncthreads() is needed. Out-of-range threads still zero V[k].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
517 
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract B^T against U along y for each z-plane (tidy selects the column of
// B); output is P1d wide in y, hence the tidy < P1d guard. Overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
537 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V instead of overwriting
// it (no V[k] = 0.0) -- the caller must have initialized V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
556 
//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract B^T against U along x for each z-plane (tidx selects the column of
// B); output is P1d x P1d in x-y, hence both guards. Overwrites V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
576 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V instead of overwriting
// it (no V[k] = 0.0) -- the caller must have initialized V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // staged plane must be visible before reading
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // slice is overwritten by the next plane
  }
}
595 
//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate each component: contract c_B along x, y, then z, moving data
// through two scratch register arrays.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch1[T1d];
  CeedScalar scratch2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + c*P1d, c_B, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + c*Q1d);
  }
}
609 
//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contract c_B^T along z, y, then x.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch1[T1d];
  CeedScalar scratch2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + c*Q1d, c_B, scratch1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + c*P1d);
  }
}
623 
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// For each component produce three outputs, one per derivative direction:
// c_G is applied along exactly one axis and c_B along the other two. Results
// land in consecutive NCOMP*Q1d panels of r_V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar scratch1[T1d];
  CeedScalar scratch2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    // d/dx
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + c*P1d, c_G, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + c*Q1d + 0*NCOMP*Q1d);
    // d/dy
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + c*P1d, c_B, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_G, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_B, r_V + c*Q1d + 1*NCOMP*Q1d);
    // d/dz
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + c*P1d, c_B, scratch1);
    ContractY3d<NCOMP, P1d, Q1d>(data, scratch1, c_B, scratch2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, scratch2, c_G, r_V + c*Q1d + 2*NCOMP*Q1d);
  }
}
643 
644 //------------------------------------------------------------------------------
645 // 3D derivatives transpose
646 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers for intermediate contraction results
  // (T1d defined elsewhere; assumed >= max(P1d, Q1d) -- TODO confirm).
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  // Transpose of grad3d: for each derivative slab d = 0 (X), 1 (Y), 2 (Z) of
  // r_U, contract backwards (Z, Y, X) with G^T in direction d and B^T in the
  // others. The first X contraction overwrites r_V; the following two use the
  // ContractTransposeAddX3d variant to accumulate the remaining contributions.
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // X-derivative slab: B^T in Z and Y, G^T in X (overwrites r_V)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // Y-derivative slab: B^T in Z, G^T in Y, B^T in X (accumulates)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // Z-derivative slab: G^T in Z, B^T in Y and X (accumulates)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
663 
664 //------------------------------------------------------------------------------
665 // 3D collocated derivatives computation
666 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient at z-slice q: X and Y derivatives via the shared-memory
  // slice, Z derivative from the per-thread register column r_U.
  //
  // Fix: the original wrapped the whole loop (including __syncthreads()) in
  // "if (tidx < Q1d && tidy < Q1d)". The block is sized by T1d, which may
  // exceed Q1d when some basis has P1d > Q1d, so out-of-range threads would
  // skip the barrier -- a divergent __syncthreads() is undefined behavior
  // (deadlock in practice). All threads now reach every barrier; only the
  // in-range threads touch the slice and outputs.
  const bool active = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Publish this component's slice-q values to shared memory
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (registers only, no shared memory involved)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites the slice
    __syncthreads();
  }
}
689 
690 //------------------------------------------------------------------------------
691 // 3D collocated derivatives transpose
692 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of gradCollo3d: accumulate the X/Y/Z derivative contributions of
  // slice q back into r_V.
  //
  // Fix: the original placed __syncthreads() inside
  // "if (tidx < Q1d && tidy < Q1d)". Blocks are sized by T1d, which may exceed
  // Q1d when some basis has P1d > Q1d, so out-of-range threads would skip the
  // barrier -- undefined behavior (deadlock in practice). All threads now
  // reach every barrier; only in-range threads access the slice and outputs.
  const bool active = (data.tidx < Q1d && data.tidy < Q1d);
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative (registers only, no shared memory involved)
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
715 
716 //------------------------------------------------------------------------------
717 // 1D quadrature weights
718 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Each thread along x loads its own 1D quadrature weight; threads past the
  // quadrature range get zero.
  if (data.tidx < Q1d) {
    *w = qweight1d[data.tidx];
  } else {
    *w = 0.0;
  }
}
723 
724 //------------------------------------------------------------------------------
725 // 2D quadrature weights
726 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product quadrature weight: product of the 1D weights in x and y,
  // zero for threads outside the quadrature range.
  const bool inside = data.tidx < Q1d && data.tidy < Q1d;
  *w = inside ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}
732 
733 //------------------------------------------------------------------------------
734 // 3D quadrature weights
735 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Each (tidx, tidy) thread stores its full column of Q1d weights in z:
  // w[z] = w_x * w_y * w_z, zero for threads outside the quadrature range.
  const bool inside = (data.tidx < Q1d && data.tidy < Q1d);
  const CeedScalar wxy = inside ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
  for (CeedInt z = 0; z < Q1d; ++z) {
    if (inside)
      w[z] = wxy*qweight1d[z];
    else
      w[z] = 0.0;
  }
}
743 
744 );
745 //------------------------------------------------------------------------------
// Build single operator kernel
747 //------------------------------------------------------------------------------
748 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
749 
750   using std::ostringstream;
751   using std::string;
752   int ierr;
753   bool setupdone;
754   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
755   if (setupdone) return 0;
756   Ceed ceed;
757   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
758   CeedOperator_Hip_gen *data;
759   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
760   CeedQFunction qf;
761   CeedQFunction_Hip_gen *qf_data;
762   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
763   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
764   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
765           numoutputfields, ncomp, dim = 0, lsize;
766   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
767   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
768   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
769   CeedChk(ierr);
770   CeedOperatorField *opinputfields, *opoutputfields;
771   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
772   CeedChk(ierr);
773   CeedQFunctionField *qfinputfields, *qfoutputfields;
774   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
775   CeedChk(ierr);
776   CeedEvalMode emode;
777   CeedBasis basis;
778   CeedBasis_Hip_shared *basis_data;
779   CeedElemRestriction Erestrict;
780   CeedElemRestriction_Hip *restr_data;
781 
782   ostringstream code;
783   string devFunctions(deviceFunctions);
784 
785   // Add atomicAdd function for old NVidia architectures
786   struct hipDeviceProp_t prop;
787   Ceed_Hip *ceed_data;
788   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
789   ierr = hipGetDeviceProperties(&prop, ceed_data->deviceId); CeedChk(ierr);
790   if (prop.major<6){
791     code << atomicAdd;
792   }
793 
794   code << devFunctions;
795 
796   string qFunction(qf_data->qFunctionSource);
797   string qFunctionName(qf_data->qFunctionName);
798   string oper;
799   oper = "CeedKernel_Hip_gen_" + qFunctionName;
800 
801   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
802   code << "\n#define CeedPragmaSIMD\n";
803 
804   // Find dim and Q1d
805   bool useCollograd = true;
806   data->maxP1d = 0;
807   for (CeedInt i = 0; i < numinputfields; i++) {
808     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
809     if (basis != CEED_BASIS_COLLOCATED) {
810       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
811       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
812       CeedChk(ierr);
813 
814       // Check for collocated gradient
815       useCollograd = useCollograd && basis_data->d_collograd1d;
816 
817       // Collect dim and Q1d
818       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
819       bool isTensor;
820       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
821       if (isTensor) {
822         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
823         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
824         if (P1d>data->maxP1d) data->maxP1d = P1d;
825       } else {
826         // LCOV_EXCL_START
827         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
828         // LCOV_EXCL_STOP
829         }
830     }
831   }
832   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
834   for (CeedInt i = 0; i < numoutputfields; i++) {
835     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
836 
837     if (basis != CEED_BASIS_COLLOCATED) {
838       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
839       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
840       CeedChk(ierr);
841 
842       // Collect dim and Q1d
843       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
844       bool isTensor;
845       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
846       if (isTensor) {
847         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
848       } else {
849         // LCOV_EXCL_START
850         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
851         // LCOV_EXCL_STOP
852         }
853 
854       // Check for collocated gradient
855       useCollograd = useCollograd && basis_data->d_collograd1d;
856     }
857   }
858   data->dim = dim;
859   data->Q1d = Q1d;
860 
861   // Define CEED_Q_VLA
862   if (dim != 3 || useCollograd) {
863     code << "\n#define CEED_Q_VLA 1\n\n";
864   } else {
865     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
866   }
867 
868   code << qFunction;
869 
870   // Setup
871   code << "\n// -----------------------------------------------------------------------------\n";
872   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
873   for (CeedInt i = 0; i < numinputfields; i++) {
874     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
875     CeedChk(ierr);
876     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
877       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
878     }
879   }
880 
881   for (CeedInt i = 0; i < numoutputfields; i++) {
882     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
883   }
884 
885   code << "  const CeedInt Dim = "<<dim<<";\n";
886   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
887 
888   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
889   code << "  BackendData data;\n";
890   code << "  data.tidx = threadIdx.x;\n";
891   code << "  data.tidy = threadIdx.y;\n";
892   code << "  data.tidz = threadIdx.z;\n";
893   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
894   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
895 
896   code << "\n  // -- Input field constants and basis data --\n";
897   //Initialize constants, and matrices B and G
898   for (CeedInt i = 0; i < numinputfields; i++) {
899     code << "  // ---- Input field "<<i<<" ----\n";
900     // Get elemsize, emode, ncomp
901     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
902     CeedChk(ierr);
903     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
904     CeedChk(ierr);
905     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
906     CeedChk(ierr);
907     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
908     CeedChk(ierr);
909 
910     // Set field constants
911     if (emode != CEED_EVAL_WEIGHT) {
912       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
913       if (basis != CEED_BASIS_COLLOCATED) {
914         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
915         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
916       } else {
917         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
918       }
919       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
920     }
921 
922     // Load basis data
923     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
924     switch (emode) {
925     case CEED_EVAL_NONE:
926       break;
927     case CEED_EVAL_INTERP:
928       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
929       data->B.in[i] = basis_data->d_interp1d;
930       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932       break;
933     case CEED_EVAL_GRAD:
934       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
935       data->B.in[i] = basis_data->d_interp1d;
936       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
937       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
938       if (useCollograd) {
939         data->G.in[i] = basis_data->d_collograd1d;
940         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
941         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
942       } else {
943         data->G.in[i] = basis_data->d_grad1d;
944         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
945         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
946       }
947       break;
948     case CEED_EVAL_WEIGHT:
949       break; // No action
950     case CEED_EVAL_DIV:
951       break; // TODO: Not implemented
952     case CEED_EVAL_CURL:
953       break; // TODO: Not implemented
954     }
955   }
956 
957   code << "\n  // -- Output field constants and basis data --\n";
958   for (CeedInt i = 0; i < numoutputfields; i++) {
959     code << "  // ---- Output field "<<i<<" ----\n";
960     // Get elemsize, emode, ncomp
961     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
962     CeedChk(ierr);
963     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
964     CeedChk(ierr);
965     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
966     CeedChk(ierr);
967     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
968     CeedChk(ierr);
969 
970     // Set field constants
971     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
972     if (basis != CEED_BASIS_COLLOCATED) {
973       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
974       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
975     } else {
976       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
977     }
978     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
979 
980     // Load basis data
981     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
982     switch (emode) {
983     case CEED_EVAL_NONE:
984       break; // No action
985     case CEED_EVAL_INTERP:
986       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
987       data->B.out[i] = basis_data->d_interp1d;
988       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990       break;
991     case CEED_EVAL_GRAD:
992       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
993       data->B.out[i] = basis_data->d_interp1d;
994       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
995       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
996       if (useCollograd) {
997         data->G.out[i] = basis_data->d_collograd1d;
998         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
999         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1000       } else {
1001         data->G.out[i] = basis_data->d_grad1d;
1002         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1003         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1004       }
1005       break;
1006     // LCOV_EXCL_START
1007     case CEED_EVAL_WEIGHT: {
1008       Ceed ceed;
1009       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1010       return CeedError(ceed, 1,
1011                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1012       break; // Should not occur
1013     }
1014     case CEED_EVAL_DIV:
1015       break; // TODO: Not implemented
1016     case CEED_EVAL_CURL:
1017       break; // TODO: Not implemented
1018       // LCOV_EXCL_STOP
1019     }
1020   }
1021   code << "\n  // -- Element loop --\n";
1022   code << "  __syncthreads();\n";
1023   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1024   // Input basis apply if needed
1025   // Generate the correct eval mode code for each input
1026   code << "    // -- Input field restrictions and basis actions --\n";
1027   for (CeedInt i = 0; i < numinputfields; i++) {
1028     code << "    // ---- Input field "<<i<<" ----\n";
1029     // Get elemsize, emode, ncomp
1030     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1031     CeedChk(ierr);
1032     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1033     CeedChk(ierr);
1034     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1035     CeedChk(ierr);
1036     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1037     CeedChk(ierr);
1038 
1039     // Restriction
1040     if (emode != CEED_EVAL_WEIGHT &&
1041         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1042       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1043 
1044       bool isStrided;
1045       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1046       if (!isStrided) {
1047         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1048         CeedChk(ierr);
1049         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1050         CeedInt compstride;
1051         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1052         code << "    // CompStride: "<<compstride<<"\n";
1053         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1054         data->indices.in[i] = restr_data->d_ind;
1055         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1056       } else {
1057         bool backendstrides;
1058         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1059         CeedChk(ierr);
1060         CeedInt nelem;
1061         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1062         CeedChk(ierr);
1063         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1064         if (!backendstrides) {
1065           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1066           CeedChk(ierr);
1067         }
1068         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1069         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1070       }
1071     }
1072 
1073     // Basis action
1074     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1075     switch (emode) {
1076     case CEED_EVAL_NONE:
1077       if (!useCollograd) {
1078         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1079       }
1080       break;
1081     case CEED_EVAL_INTERP:
1082       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1083       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1084       break;
1085     case CEED_EVAL_GRAD:
1086       if (useCollograd) {
1087         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1088         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1089       } else {
1090         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1091         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1092       }
1093       break;
1094     case CEED_EVAL_WEIGHT:
1095       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1096       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1097       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1098       data->W = basis_data->d_qweight1d;
1099       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1100       break; // No action
1101     case CEED_EVAL_DIV:
1102       break; // TODO: Not implemented
1103     case CEED_EVAL_CURL:
1104       break; // TODO: Not implemented
1105     }
1106   }
1107 
1108   // Q function
1109   code << "\n    // -- Output field setup --\n";
1110   for (CeedInt i = 0; i < numoutputfields; i++) {
1111       code << "\n    // ---- Output field "<<i<<" ----\n";
1112     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1113     CeedChk(ierr);
1114     if (emode==CEED_EVAL_GRAD)
1115     {
1116       if (useCollograd) {
1117         //Accumulator for gradient slices
1118         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1119         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1120         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1121         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1122         code << "      }\n";
1123         code << "    }\n";
1124       } else {
1125         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1126       }
1127     }
1128     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1129     {
1130       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131     }
1132   }
1133   // We treat quadrature points per slice in 3d to save registers
1134   if (useCollograd) {
1135     code << "\n    // Note: Collocated Gradient\n";
1136     code << "#pragma unroll\n";
1137     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1138     code << "      // -- Input fields --\n";
1139     for (CeedInt i = 0; i < numinputfields; i++) {
1140       code << "      // ---- Input field "<<i<<" ----\n";
1141       // Get elemsize, emode, ncomp
1142       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1143       CeedChk(ierr);
1144       // Basis action
1145       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1146       switch (emode) {
1147       case CEED_EVAL_NONE:
1148         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1149 
1150         bool isStrided;
1151         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1152         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1153         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1154         if (!isStrided) {
1155           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1156           CeedChk(ierr);
1157           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1158           CeedInt compstride;
1159           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1160           code << "      // CompStride: "<<compstride<<"\n";
1161           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1162           data->indices.in[i] = restr_data->d_ind;
1163           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1164         } else {
1165           bool backendstrides;
1166           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1167           CeedChk(ierr);
1168           CeedInt nelem;
1169           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1170           CeedChk(ierr);
1171           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1172           if (!backendstrides) {
1173             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1174             CeedChk(ierr);
1175           }
1176           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1177           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1178         }
1179         break;
1180       case CEED_EVAL_INTERP:
1181         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1182         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1183         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1184         code << "      }\n";
1185         break;
1186       case CEED_EVAL_GRAD:
1187         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1188         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1189         break;
1190       case CEED_EVAL_WEIGHT:
1191         code << "      CeedScalar r_q"<<i<<"[1];\n";
1192         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1193         break; // No action
1194       case CEED_EVAL_DIV:
1195         break; // TODO: Not implemented
1196       case CEED_EVAL_CURL:
1197         break; // TODO: Not implemented
1198       }
1199     }
1200     code << "\n      // -- Output fields --\n";
1201     for (CeedInt i = 0; i < numoutputfields; i++) {
1202       code << "      // ---- Output field "<<i<<" ----\n";
1203       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1204       CeedChk(ierr);
1205       // Basis action
1206       switch (emode) {
1207       case CEED_EVAL_NONE:
1208         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1209         break; // No action
1210       case CEED_EVAL_INTERP:
1211         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1212         break;
1213       case CEED_EVAL_GRAD:
1214         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1215         break;
1216       case CEED_EVAL_WEIGHT:
1217         break; // Should not occur
1218       case CEED_EVAL_DIV:
1219         break; // TODO: Not implemented
1220       case CEED_EVAL_CURL:
1221         break; // TODO: Not implemented
1222       }
1223     }
1224   } else {
1225     code << "\n      // Note: No Collocated Gradient\n";
1226     code << "      // -- Input fields --\n";
1227     for (CeedInt i = 0; i < numinputfields; i++) {
1228       code << "      // ---- Input field "<<i<<" ----\n";
1229       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1230     }
1231     code << "      // -- Output fields --\n";
1232     for (CeedInt i = 0; i < numoutputfields; i++) {
1233       code << "      // ---- Output field "<<i<<" ----\n";
1234       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1235     }
1236   }
1237   code << "\n      // -- QFunction Inputs and outputs --\n";
1238   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1239   for (CeedInt i = 0; i < numinputfields; i++) {
1240     code << "      // ---- Input field "<<i<<" ----\n";
1241     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1242   }
1243   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1244   for (CeedInt i = 0; i < numoutputfields; i++) {
1245     code << "      // ---- Output field "<<i<<" ----\n";
1246     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1247   }
1248   code << "\n      // -- Apply QFunction --\n";
1249   code << "      "<<qFunctionName<<"(ctx, ";
1250   if (dim != 3 || useCollograd) {
1251     code << "1";
1252   } else {
1253     code << "Q1d";
1254   }
1255   code << ", in, out);\n";
1256   if (useCollograd) {
1257     code << "\n      // Note: Collocated Gradient\n";
1258     code << "      // -- Output fields --\n";
1259     for (CeedInt i = 0; i < numoutputfields; i++) {
1260       code << "      // ---- Output field "<<i<<" ----\n";
1261       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1262       CeedChk(ierr);
1263       // Basis action
1264       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1265       switch (emode) {
1266       case CEED_EVAL_NONE:
1267         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1268         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1269         code << "      }\n";
1270         break; // No action
1271       case CEED_EVAL_INTERP:
1272         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1273         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1274         code << "      }\n";
1275         break;
1276       case CEED_EVAL_GRAD:
1277         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1278         break;
1279       case CEED_EVAL_WEIGHT:
1280         break; // Should not occur
1281       case CEED_EVAL_DIV:
1282         break; // TODO: Not implemented
1283       case CEED_EVAL_CURL:
1284         break; // TODO: Not implemented
1285       }
1286     }
1287     code << "    }\n";
1288   }
1289 
1290   // Output basis apply if needed
1291   // Generate the correct eval mode code for each output
1292   code << "\n    // -- Output field basis action and restrictions --\n";
1293   for (CeedInt i = 0; i < numoutputfields; i++) {
1294     code << "    // ---- Output field "<<i<<" ----\n";
1295     // Get elemsize, emode, ncomp
1296     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1297     CeedChk(ierr);
1298     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1299     CeedChk(ierr);
1300     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1301     CeedChk(ierr);
1302     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1303     CeedChk(ierr);
1304     // Basis action
1305     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1306     switch (emode) {
1307     case CEED_EVAL_NONE:
1308       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1309       break; // No action
1310     case CEED_EVAL_INTERP:
1311       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1312       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1313       break;
1314     case CEED_EVAL_GRAD:
1315       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1316       if (useCollograd) {
1317         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1318       } else {
1319         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1320       }
1321       break;
1322     // LCOV_EXCL_START
1323     case CEED_EVAL_WEIGHT: {
1324       Ceed ceed;
1325       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1326       return CeedError(ceed, 1,
1327                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1328       break; // Should not occur
1329     }
1330     case CEED_EVAL_DIV:
1331       break; // TODO: Not implemented
1332     case CEED_EVAL_CURL:
1333       break; // TODO: Not implemented
1334       // LCOV_EXCL_STOP
1335     }
1336     // Restriction
1337       bool isStrided;
1338       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1339     if (!isStrided) {
1340       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1341       CeedChk(ierr);
1342       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1343       CeedInt compstride;
1344       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1345       code << "    // CompStride: "<<compstride<<"\n";
1346       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1347       data->indices.out[i] = restr_data->d_ind;
1348       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1349     } else {
1350       bool backendstrides;
1351       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1352       CeedChk(ierr);
1353       CeedInt nelem;
1354       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1355       CeedChk(ierr);
1356       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1357       if (!backendstrides) {
1358         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1359         CeedChk(ierr);
1360       }
1361       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1362       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1363     }
1364   }
1365 
1366   code << "  }\n";
1367   code << "}\n";
1368   code << "// -----------------------------------------------------------------------------\n\n";
1369 
1370   // View kernel for debugging
1371   CeedDebug(code.str().c_str());
1372 
1373   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
1374                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1375   CeedChk(ierr);
1376   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1377   CeedChk(ierr);
1378 
1379   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1380   return 0;
1381 }
1382 //------------------------------------------------------------------------------
1383