xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 3fc8a154943bcd95936b4475143d7e8b037e7b11)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <iostream>
22 #include <string>
23 #include <sstream>
24 #include "ceed-hip-gen.h"
25 #include "../hip-ref/ceed-hip-ref.h"
26 #include "../hip-shared/ceed-hip-shared.h"
27 #include "../hip/ceed-hip-compile.h"
28 
29 
30 //------------------------------------------------------------------------------
31 // Calculate the block size used for launching the operator kernel
32 //------------------------------------------------------------------------------
33 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
34      	                                  const CeedInt P1d, const CeedInt Q1d,
35 				          CeedInt *block_sizes) {
36 
37   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
38   if (dim==1) {
39     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
40     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
41     block_sizes[0] = thread1d;
42     block_sizes[1] = 1;
43     block_sizes[2] = elemsPerBlock;
44   } else if (dim==2) {
45     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
46     block_sizes[0] = thread1d;
47     block_sizes[1] = thread1d;
48     block_sizes[2] = elemsPerBlock;
49   } else if (dim==3) {
50     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
51     block_sizes[0] = thread1d;
52     block_sizes[1] = thread1d;
53     block_sizes[2] = elemsPerBlock;
54   }
55   return CEED_ERROR_SUCCESS;
56 }
57 
58 static const char *deviceFunctions = QUOTE(
59 
60 //------------------------------------------------------------------------------
61 // Typedefs
62 //------------------------------------------------------------------------------
// Bundles of kernel input/output field pointers (up to 16 each)
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
// Restriction index arrays for each input/output field (NULL-like for strided)
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread context passed through the device helpers below
typedef struct {
  CeedInt tidx;        // thread index in x (node/quad point in x)
  CeedInt tidy;        // thread index in y
  CeedInt tidz;        // thread index in z (element within the block)
  CeedInt tid;         // flattened thread index within the block
  CeedScalar* slice;   // this element's slab of shared memory for contractions
} BackendData;
73 
74 //------------------------------------------------------------------------------
75 // Load matrices for basis actions
76 //------------------------------------------------------------------------------
77 template <int P, int Q>
78 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
79   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
80     B[i] = d_B[i];
81 }
82 
83 //------------------------------------------------------------------------------
84 // 1D
85 //------------------------------------------------------------------------------
86 
87 //------------------------------------------------------------------------------
88 // L-vector -> E-vector, offsets provided
89 //------------------------------------------------------------------------------
90 template <int NCOMP, int COMPSTRIDE, int P1d>
91 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
92   if (data.tidx < P1d) {
93     const CeedInt node = data.tidx;
94     const CeedInt ind = indices[node + elem * P1d];
95     for (CeedInt comp = 0; comp < NCOMP; ++comp)
96       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
97   }
98 }
99 
100 //------------------------------------------------------------------------------
101 // L-vector -> E-vector, strided
102 //------------------------------------------------------------------------------
103 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
104 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
105   if (data.tidx < P1d) {
106     const CeedInt node = data.tidx;
107     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
108     for (CeedInt comp = 0; comp < NCOMP; ++comp)
109       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
110   }
111 }
112 
113 //------------------------------------------------------------------------------
114 // E-vector -> L-vector, offsets provided
115 //------------------------------------------------------------------------------
116 template <int NCOMP, int COMPSTRIDE, int P1d>
117 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
118   if (data.tidx < P1d) {
119     const CeedInt node = data.tidx;
120     const CeedInt ind = indices[node + elem * P1d];
121     for (CeedInt comp = 0; comp < NCOMP; ++comp)
122       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
123   }
124 }
125 
126 //------------------------------------------------------------------------------
127 // E-vector -> L-vector, strided
128 //------------------------------------------------------------------------------
129 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
130 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
131   if (data.tidx < P1d) {
132     const CeedInt node = data.tidx;
133     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
134     for (CeedInt comp = 0; comp < NCOMP; ++comp)
135       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
136   }
137 }
138 
139 //------------------------------------------------------------------------------
140 // 1D tensor contraction x
141 //------------------------------------------------------------------------------
// Apply B (Q1d x P1d, column index fastest) to the per-thread value *U,
// producing *V at this thread's quadrature point.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage every thread's value in shared memory so each output point can
  // read all P1d inputs
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  // Barrier before slice is overwritten by the next contraction; must be
  // reached by all threads, hence it sits outside the guarded branch
  __syncthreads();
}
152 
153 //------------------------------------------------------------------------------
154 // 1D transpose tensor contraction x
155 //------------------------------------------------------------------------------
// Apply B^T (P1d x Q1d view of the same matrix) to the per-thread value *U,
// producing *V at this thread's node.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  // All threads must hit this barrier before slice is reused
  __syncthreads();
}
166 
167 //------------------------------------------------------------------------------
168 // 1D interpolate to quadrature points
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp = 0; comp < NCOMP; comp++)
173     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D interpolate transpose
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp=0; comp<NCOMP; comp++)
182     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 1D derivatives at quadrature points
187 //------------------------------------------------------------------------------
188 template <int NCOMP, int P1d, int Q1d>
189 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
190   for (CeedInt comp = 0; comp < NCOMP; comp++)
191     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
192 }
193 
194 //------------------------------------------------------------------------------
195 // 1D derivatives transpose
196 //------------------------------------------------------------------------------
197 template <int NCOMP, int P1d, int Q1d>
198 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
199   for (CeedInt comp = 0; comp < NCOMP; comp++)
200     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
201 }
202 
203 //------------------------------------------------------------------------------
204 // 2D
205 //------------------------------------------------------------------------------
206 
207 //------------------------------------------------------------------------------
208 // L-vector -> E-vector, offsets provided
209 //------------------------------------------------------------------------------
210 template <int NCOMP, int COMPSTRIDE, int P1d>
211 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
212   if (data.tidx < P1d && data.tidy < P1d) {
213     const CeedInt node = data.tidx + data.tidy*P1d;
214     const CeedInt ind = indices[node + elem * P1d*P1d];
215     for (CeedInt comp = 0; comp < NCOMP; ++comp)
216       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
217   }
218 }
219 
220 //------------------------------------------------------------------------------
221 // L-vector -> E-vector, strided
222 //------------------------------------------------------------------------------
223 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
224 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
225   if (data.tidx < P1d && data.tidy < P1d) {
226     const CeedInt node = data.tidx + data.tidy*P1d;
227     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
228     for (CeedInt comp = 0; comp < NCOMP; ++comp)
229       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
230   }
231 }
232 
233 //------------------------------------------------------------------------------
234 // E-vector -> L-vector, offsets provided
235 //------------------------------------------------------------------------------
236 template <int NCOMP, int COMPSTRIDE, int P1d>
237 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
238   if (data.tidx < P1d && data.tidy < P1d) {
239     const CeedInt node = data.tidx + data.tidy*P1d;
240     const CeedInt ind = indices[node + elem * P1d*P1d];
241     for (CeedInt comp = 0; comp < NCOMP; ++comp)
242       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
243   }
244 }
245 
246 //------------------------------------------------------------------------------
247 // E-vector -> L-vector, strided
248 //------------------------------------------------------------------------------
249 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
250 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
251   if (data.tidx < P1d && data.tidy < P1d) {
252     const CeedInt node = data.tidx + data.tidy*P1d;
253     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
254     for (CeedInt comp = 0; comp < NCOMP; ++comp)
255       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
256   }
257 }
258 
259 //------------------------------------------------------------------------------
260 // 2D tensor contraction x
261 //------------------------------------------------------------------------------
// Contract B against the x index of the (tidx, tidy) tile held one value per
// thread; T1d is the shared-memory row stride (defined at JIT time).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  // Unconditional barrier: every thread must arrive before slice is reused
  __syncthreads();
}
272 
273 //------------------------------------------------------------------------------
274 // 2D tensor contract y
275 //------------------------------------------------------------------------------
// Contract B against the y index of the tile; output lives at (tidx, tidy)
// with both coordinates now quadrature indices.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
286 
287 //------------------------------------------------------------------------------
288 // 2D transpose tensor contract y
289 //------------------------------------------------------------------------------
// Contract B^T against the y index (quadrature -> node in y).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
300 
301 //------------------------------------------------------------------------------
302 // 2D transpose tensor contract x
303 //------------------------------------------------------------------------------
// Contract B^T against the x index (quadrature -> node in x), overwriting *V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
314 
315 //------------------------------------------------------------------------------
316 // 2D transpose tensor contract and add x
317 //------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into *V instead of
// overwriting it (used to sum the two gradient contributions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  // No "*V = 0.0" here: caller relies on accumulation
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
327 
328 //------------------------------------------------------------------------------
329 // 2D interpolate to quadrature points
330 //------------------------------------------------------------------------------
331 template <int NCOMP, int P1d, int Q1d>
332 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
333   CeedScalar r_t[1];
334   for (CeedInt comp = 0; comp < NCOMP; comp++) {
335     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
336     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
337   }
338 }
339 
340 //------------------------------------------------------------------------------
341 // 2D interpolate transpose
342 //------------------------------------------------------------------------------
343 template <int NCOMP, int P1d, int Q1d>
344 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
345   CeedScalar r_t[1];
346   for (CeedInt comp = 0; comp < NCOMP; comp++) {
347     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
348     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
349   }
350 }
351 
352 //------------------------------------------------------------------------------
353 // 2D derivatives at quadrature points
354 //------------------------------------------------------------------------------
355 template <int NCOMP, int P1d, int Q1d>
356 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
357   CeedScalar r_t[1];
358   for (CeedInt comp = 0; comp < NCOMP; comp++) {
359     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
360     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
361     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
362     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
363   }
364 }
365 
366 //------------------------------------------------------------------------------
367 // 2D derivatives transpose
368 //------------------------------------------------------------------------------
// 2D transpose gradient: sums the transposed d/dx and d/dy contributions
// into r_V[comp]. The Add variant must run second so the first result is
// not overwritten.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
379 
380 //------------------------------------------------------------------------------
381 // 3D
382 //------------------------------------------------------------------------------
383 
384 //------------------------------------------------------------------------------
385 // L-vector -> E-vector, offsets provided
386 //------------------------------------------------------------------------------
387 template <int NCOMP, int COMPSTRIDE, int P1d>
388 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
389   if (data.tidx < P1d && data.tidy < P1d)
390     for (CeedInt z = 0; z < P1d; ++z) {
391       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
392       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
393       for (CeedInt comp = 0; comp < NCOMP; ++comp)
394         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
395     }
396 }
397 
398 //------------------------------------------------------------------------------
399 // L-vector -> E-vector, strided
400 //------------------------------------------------------------------------------
401 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
402 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
403   if (data.tidx < P1d && data.tidy < P1d)
404     for (CeedInt z = 0; z < P1d; ++z) {
405       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
406       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
407       for (CeedInt comp = 0; comp < NCOMP; ++comp)
408         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
409     }
410 }
411 
412 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
414 //------------------------------------------------------------------------------
415 template <int NCOMP, int COMPSTRIDE, int Q1d>
416 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
417   if (data.tidx < Q1d && data.tidy < Q1d) {
418     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
419     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
420     for (CeedInt comp = 0; comp < NCOMP; ++comp)
421       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
422   }
423 }
424 
425 //------------------------------------------------------------------------------
426 // E-vector -> Q-vector, strided
427 //------------------------------------------------------------------------------
428 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
429 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
430   if (data.tidx < Q1d && data.tidy < Q1d) {
431     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
432     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
433     for (CeedInt comp = 0; comp < NCOMP; ++comp)
434       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
435   }
436 }
437 
438 //------------------------------------------------------------------------------
439 // E-vector -> L-vector, offsets provided
440 //------------------------------------------------------------------------------
441 template <int NCOMP, int COMPSTRIDE, int P1d>
442 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
443   if (data.tidx < P1d && data.tidy < P1d)
444     for (CeedInt z = 0; z < P1d; ++z) {
445       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
446       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
447       for (CeedInt comp = 0; comp < NCOMP; ++comp)
448         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
449     }
450 }
451 
452 //------------------------------------------------------------------------------
453 // E-vector -> L-vector, strided
454 //------------------------------------------------------------------------------
455 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
456 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
457   if (data.tidx < P1d && data.tidy < P1d)
458     for (CeedInt z = 0; z < P1d; ++z) {
459       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
460       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
461       for (CeedInt comp = 0; comp < NCOMP; ++comp)
462         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
463     }
464 }
465 
466 //------------------------------------------------------------------------------
467 // 3D tensor contract x
468 //------------------------------------------------------------------------------
// Contract B against the x index, one z-level (k) at a time; U/V hold a
// z-column of values in registers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers (reused for every z-level)
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage the k-th z-level in shared memory; both barriers are outside the
    // guarded branch so every thread reaches them
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
485 
486 //------------------------------------------------------------------------------
487 // 3D tensor contract y
488 //------------------------------------------------------------------------------
// Contract B against the y index, one z-level (k) at a time.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
505 
506 //------------------------------------------------------------------------------
507 // 3D tensor contract z
508 //------------------------------------------------------------------------------
// Contract B against the z index. The z-column lives entirely in this
// thread's registers, so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
518 
519 //------------------------------------------------------------------------------
520 // 3D transpose tensor contract z
521 //------------------------------------------------------------------------------
// Contract B^T against the z index; register-only, no barriers needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
531 
532 //------------------------------------------------------------------------------
533 // 3D transpose tensor contract y
534 //------------------------------------------------------------------------------
// Contract B^T against the y index, one z-level (k) at a time, overwriting V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
551 
552 //------------------------------------------------------------------------------
553 // 3D transpose tensor contract add y
554 //------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V instead of
// overwriting it (used to sum gradient contributions).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    // No "V[k] = 0.0" here: caller relies on accumulation
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
570 
571 //------------------------------------------------------------------------------
572 // 3D transpose tensor contract x
573 //------------------------------------------------------------------------------
// Contract B^T against the x index, one z-level (k) at a time, overwriting V.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
590 
591 //------------------------------------------------------------------------------
592 // 3D transpose tensor contract add x
593 //------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V instead of
// overwriting it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    // No "V[k] = 0.0" here: caller relies on accumulation
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
609 
610 //------------------------------------------------------------------------------
611 // 3D interpolate to quadrature points
612 //------------------------------------------------------------------------------
613 template <int NCOMP, int P1d, int Q1d>
614 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
615   CeedScalar r_t1[T1d];
616   CeedScalar r_t2[T1d];
617   for (CeedInt comp = 0; comp < NCOMP; comp++) {
618     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
619     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
620     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
621   }
622 }
623 
624 //------------------------------------------------------------------------------
625 // 3D interpolate transpose
626 //------------------------------------------------------------------------------
627 template <int NCOMP, int P1d, int Q1d>
628 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
629   CeedScalar r_t1[T1d];
630   CeedScalar r_t2[T1d];
631   for (CeedInt comp = 0; comp < NCOMP; comp++) {
632     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
633     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
634     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
635   }
636 }
637 
638 //------------------------------------------------------------------------------
639 // 3D derivatives at quadrature points
640 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Tensor-product gradient: for each component, three passes each apply the
  // 1D derivative matrix G in exactly one direction and the interpolation
  // matrix B in the other two.  Output is grouped by direction:
  // r_V[comp*Q1d + dir*NCOMP*Q1d] holds the dir-th partial derivative.
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dx: G in x, B in y and z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dy: B in x, G in y, B in z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dz: B in x and y, G in z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
657 
658 //------------------------------------------------------------------------------
659 // 3D derivatives transpose
660 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient: for each component, fold the three directional
  // derivative blocks of r_U (dir*NCOMP*Q1d apart) back to the nodes.  The
  // first pass writes r_V; the second and third use the accumulate-add
  // variant of the final x-contraction so all three contributions sum into
  // the same nodal values.
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // x-derivative block: G^T in x, B^T in y and z
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // y-derivative block: G^T in y, B^T in x and z (accumulated)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // z-derivative block: G^T in z, B^T in x and y (accumulated)
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
677 
678 //------------------------------------------------------------------------------
679 // 3D collocated derivatives computation
680 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated gradient at z-slice q: r_U holds already-interpolated
  // quadrature values (Q1d per component, one column in z per thread);
  // r_V receives the x, y, and z derivatives for this slice, NCOMP values
  // per direction.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // Stage this component's slice q in shared memory so the x/y
      // contractions can read values owned by neighboring threads.
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
      // NOTE(review): this __syncthreads() is inside the tidx/tidy < Q1d
      // branch; if the block is launched with T1d = max(P1d, Q1d) > Q1d
      // threads per dimension, not every thread reaches the barrier --
      // confirm the launch configuration makes this safe.
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative: contracts the thread-local z-column of r_U directly,
      // no shared memory needed.
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      // Barrier before the next component overwrites the shared slice.
      __syncthreads();
    }
  }
}
703 
704 //------------------------------------------------------------------------------
705 // 3D collocated derivatives transpose
706 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose collocated gradient at z-slice q: r_U holds NCOMP values per
  // direction for this slice; contributions are ACCUMULATED into r_V
  // (caller is responsible for zero-initializing r_V before the q loop).
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative: stage in shared memory, contract with G transposed
      // (note the swapped c_G indexing relative to gradCollo3d).
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      // NOTE(review): __syncthreads() inside the tidx/tidy < Q1d branch;
      // safe only if no thread of the block falls outside the branch --
      // confirm against the launch configuration.
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: scatter this slice's single value into the whole
      // thread-local z-column; completed only once all q slices have run.
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}
729 
730 //------------------------------------------------------------------------------
731 // 1D quadrature weights
732 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Each thread along x picks up its own 1D quadrature weight; threads
  // outside the quadrature range contribute zero.
  if (data.tidx < Q1d) {
    *w = qweight1d[data.tidx];
  } else {
    *w = 0.0;
  }
}
737 
738 //------------------------------------------------------------------------------
739 // 2D quadrature weights
740 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product weight: product of the 1D weights in x and y.
  // Threads outside the Q1d x Q1d quadrature tile write zero.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    *w = qweight1d[data.tidx] * qweight1d[data.tidy];
  } else {
    *w = 0.0;
  }
}
746 
747 //------------------------------------------------------------------------------
748 // 3D quadrature weights
749 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // Tensor-product weight: each thread owns a full column of Q1d values
  // in z.  Threads outside the Q1d x Q1d tile fill their column with zero.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedScalar wxy = qweight1d[data.tidx] * qweight1d[data.tidy];
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = wxy * qweight1d[z];
  } else {
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = 0.0;
  }
}
757 
758 );
759 //------------------------------------------------------------------------------
// Build single operator kernel
761 //------------------------------------------------------------------------------
762 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
763 
764   using std::ostringstream;
765   using std::string;
766   int ierr;
767   bool setupdone;
768   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
769   if (setupdone) return CEED_ERROR_SUCCESS;
770   Ceed ceed;
771   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
772   CeedOperator_Hip_gen *data;
773   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
774   CeedQFunction qf;
775   CeedQFunction_Hip_gen *qf_data;
776   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
777   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
778   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
779           numoutputfields, ncomp, dim = 0, lsize;
780   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
781   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
782   CeedOperatorField *opinputfields, *opoutputfields;
783   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
784   CeedChkBackend(ierr);
785   CeedQFunctionField *qfinputfields, *qfoutputfields;
786   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
787   CeedChkBackend(ierr);
788   CeedEvalMode emode;
789   CeedBasis basis;
790   CeedBasis_Hip_shared *basis_data;
791   CeedElemRestriction Erestrict;
792   CeedElemRestriction_Hip *restr_data;
793 
794   // Check for restriction only identity operator
795   bool is_identity_qf;
796   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
797   if (is_identity_qf) {
798     CeedEvalMode emodein, emodeout;
799     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
800     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
801     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
802       // LCOV_EXCL_START
803       return CeedError(ceed, CEED_ERROR_BACKEND,
804                        "Backend does not implement restriction only identity operators");
805     // LCOV_EXCL_STOP
806   }
807 
808   ostringstream code;
809   string devFunctions(deviceFunctions);
810   code << devFunctions;
811 
812   string qFunction(qf_data->qFunctionSource);
813   string qFunctionName(qf_data->qFunctionName);
814   string oper;
815   oper = "CeedKernel_Hip_gen_" + qFunctionName;
816 
817   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
818   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
819   code << "#define CeedPragmaSIMD\n";
820   code << "#define CEED_ERROR_SUCCESS 0\n\n";
821 
822   // Find dim and Q1d
823   bool useCollograd = true;
824   data->maxP1d = 0;
825   for (CeedInt i = 0; i < numinputfields; i++) {
826     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
827     if (basis != CEED_BASIS_COLLOCATED) {
828       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
829       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
830       CeedChkBackend(ierr);
831 
832       // Check for collocated gradient
833       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
834 
835       // Collect dim and Q1d
836       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
837       bool isTensor;
838       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
839       if (isTensor) {
840         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
841         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
842         if (P1d>data->maxP1d) data->maxP1d = P1d;
843       } else {
844         // LCOV_EXCL_START
845         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
846         // LCOV_EXCL_STOP
847         }
848     }
849   }
850   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
852   for (CeedInt i = 0; i < numoutputfields; i++) {
853     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
854 
855     if (basis != CEED_BASIS_COLLOCATED) {
856       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
857       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
858       CeedChkBackend(ierr);
859 
860       // Collect dim and Q1d
861       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
862       bool isTensor;
863       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
864       if (isTensor) {
865         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
866       } else {
867         // LCOV_EXCL_START
868         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
869         // LCOV_EXCL_STOP
870         }
871 
872       // Check for collocated gradient
873       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
874     }
875   }
876   data->dim = dim;
877   data->Q1d = Q1d;
878 
879   // Define CEED_Q_VLA
880   if (dim != 3 || useCollograd) {
881     code << "\n#define CEED_Q_VLA 1\n\n";
882   } else {
883     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
884   }
885 
886   code << qFunction;
887 
888   // Setup
889   code << "\n// -----------------------------------------------------------------------------\n";
890   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
891   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
892   for (CeedInt i = 0; i < numinputfields; i++) {
893     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
894     CeedChkBackend(ierr);
895     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
896       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
897     }
898   }
899 
900   for (CeedInt i = 0; i < numoutputfields; i++) {
901     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
902   }
903 
904   code << "  const CeedInt Dim = "<<dim<<";\n";
905   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
906 
907   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
908   code << "  BackendData data;\n";
909   code << "  data.tidx = threadIdx.x;\n";
910   code << "  data.tidy = threadIdx.y;\n";
911   code << "  data.tidz = threadIdx.z;\n";
912   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
913   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
914 
915   code << "\n  // -- Input field constants and basis data --\n";
916   //Initialize constants, and matrices B and G
917   for (CeedInt i = 0; i < numinputfields; i++) {
918     code << "  // ---- Input field "<<i<<" ----\n";
919     // Get elemsize, emode, ncomp
920     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
921     CeedChkBackend(ierr);
922     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
923     CeedChkBackend(ierr);
924     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
925     CeedChkBackend(ierr);
926     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
927     CeedChkBackend(ierr);
928 
929     // Set field constants
930     if (emode != CEED_EVAL_WEIGHT) {
931       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
932       if (basis != CEED_BASIS_COLLOCATED) {
933         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
934         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
935       } else {
936         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
937       }
938       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
939     }
940 
941     // Load basis data
942     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
943     switch (emode) {
944     case CEED_EVAL_NONE:
945       break;
946     case CEED_EVAL_INTERP:
947       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
948       data->B.in[i] = basis_data->d_interp_1d;
949       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
950       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
951       break;
952     case CEED_EVAL_GRAD:
953       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
954       data->B.in[i] = basis_data->d_interp_1d;
955       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
956       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
957       if (useCollograd) {
958         data->G.in[i] = basis_data->d_collo_grad_1d;
959         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
960         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
961       } else {
962         data->G.in[i] = basis_data->d_grad_1d;
963         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
964         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
965       }
966       break;
967     case CEED_EVAL_WEIGHT:
968       break; // No action
969     case CEED_EVAL_DIV:
970       break; // TODO: Not implemented
971     case CEED_EVAL_CURL:
972       break; // TODO: Not implemented
973     }
974   }
975 
976   code << "\n  // -- Output field constants and basis data --\n";
977   for (CeedInt i = 0; i < numoutputfields; i++) {
978     code << "  // ---- Output field "<<i<<" ----\n";
979     // Get elemsize, emode, ncomp
980     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
981     CeedChkBackend(ierr);
982     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
983     CeedChkBackend(ierr);
984     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
985     CeedChkBackend(ierr);
986     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
987     CeedChkBackend(ierr);
988 
989     // Set field constants
990     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
991     if (basis != CEED_BASIS_COLLOCATED) {
992       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
993       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
994     } else {
995       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
996     }
997     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
998 
999     // Load basis data
1000     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1001     switch (emode) {
1002     case CEED_EVAL_NONE:
1003       break; // No action
1004     case CEED_EVAL_INTERP:
1005       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1006       data->B.out[i] = basis_data->d_interp_1d;
1007       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1008       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1009       break;
1010     case CEED_EVAL_GRAD:
1011       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1012       data->B.out[i] = basis_data->d_interp_1d;
1013       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1014       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1015       if (useCollograd) {
1016         data->G.out[i] = basis_data->d_collo_grad_1d;
1017         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1018         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1019       } else {
1020         data->G.out[i] = basis_data->d_grad_1d;
1021         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1022         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1023       }
1024       break;
1025     // LCOV_EXCL_START
1026     case CEED_EVAL_WEIGHT: {
1027       Ceed ceed;
1028       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1029       return CeedError(ceed, CEED_ERROR_BACKEND,
1030                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1031       break; // Should not occur
1032     }
1033     case CEED_EVAL_DIV:
1034       break; // TODO: Not implemented
1035     case CEED_EVAL_CURL:
1036       break; // TODO: Not implemented
1037       // LCOV_EXCL_STOP
1038     }
1039   }
1040   code << "\n  // -- Element loop --\n";
1041   code << "  __syncthreads();\n";
1042   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1043   // Input basis apply if needed
1044   // Generate the correct eval mode code for each input
1045   code << "    // -- Input field restrictions and basis actions --\n";
1046   for (CeedInt i = 0; i < numinputfields; i++) {
1047     code << "    // ---- Input field "<<i<<" ----\n";
1048     // Get elemsize, emode, ncomp
1049     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1050     CeedChkBackend(ierr);
1051     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1052     CeedChkBackend(ierr);
1053     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1054     CeedChkBackend(ierr);
1055     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1056     CeedChkBackend(ierr);
1057 
1058     // Restriction
1059     if (emode != CEED_EVAL_WEIGHT &&
1060         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1061       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1062 
1063       bool isStrided;
1064       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1065       if (!isStrided) {
1066         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1067         CeedChkBackend(ierr);
1068         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1069         CeedInt compstride;
1070         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1071         code << "    // CompStride: "<<compstride<<"\n";
1072         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1073         data->indices.in[i] = restr_data->d_ind;
1074         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1075       } else {
1076         bool backendstrides;
1077         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1078         CeedChkBackend(ierr);
1079         CeedInt nelem;
1080         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1081         CeedChkBackend(ierr);
1082         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1083         if (!backendstrides) {
1084           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1085           CeedChkBackend(ierr);
1086         }
1087         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1088         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1089       }
1090     }
1091 
1092     // Basis action
1093     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1094     switch (emode) {
1095     case CEED_EVAL_NONE:
1096       if (!useCollograd) {
1097         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1098       }
1099       break;
1100     case CEED_EVAL_INTERP:
1101       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1102       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1103       break;
1104     case CEED_EVAL_GRAD:
1105       if (useCollograd) {
1106         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1107         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1108       } else {
1109         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1110         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1111       }
1112       break;
1113     case CEED_EVAL_WEIGHT:
1114       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1115       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1116       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1117       data->W = basis_data->d_q_weight_1d;
1118       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1119       break; // No action
1120     case CEED_EVAL_DIV:
1121       break; // TODO: Not implemented
1122     case CEED_EVAL_CURL:
1123       break; // TODO: Not implemented
1124     }
1125   }
1126 
1127   // Q function
1128   code << "\n    // -- Output field setup --\n";
1129   for (CeedInt i = 0; i < numoutputfields; i++) {
1130       code << "\n    // ---- Output field "<<i<<" ----\n";
1131     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1132     CeedChkBackend(ierr);
1133     if (emode==CEED_EVAL_GRAD)
1134     {
1135       if (useCollograd) {
1136         //Accumulator for gradient slices
1137         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1138         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1139         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1140         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1141         code << "      }\n";
1142         code << "    }\n";
1143       } else {
1144         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1145       }
1146     }
1147     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1148     {
1149       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1150     }
1151   }
1152   // We treat quadrature points per slice in 3d to save registers
1153   if (useCollograd) {
1154     code << "\n    // Note: Collocated Gradient\n";
1155     code << "#pragma unroll\n";
1156     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1157     code << "      // -- Input fields --\n";
1158     for (CeedInt i = 0; i < numinputfields; i++) {
1159       code << "      // ---- Input field "<<i<<" ----\n";
1160       // Get elemsize, emode, ncomp
1161       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1162       CeedChkBackend(ierr);
1163       // Basis action
1164       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1165       switch (emode) {
1166       case CEED_EVAL_NONE:
1167         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1168 
1169         bool isStrided;
1170         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1171         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1172         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1173         if (!isStrided) {
1174           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1175           CeedChkBackend(ierr);
1176           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1177           CeedInt compstride;
1178           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1179           code << "      // CompStride: "<<compstride<<"\n";
1180           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1181           data->indices.in[i] = restr_data->d_ind;
1182           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1183         } else {
1184           bool backendstrides;
1185           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1186           CeedChkBackend(ierr);
1187           CeedInt nelem;
1188           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1189           CeedChkBackend(ierr);
1190           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1191           if (!backendstrides) {
1192             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1193             CeedChkBackend(ierr);
1194           }
1195           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1196           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1197         }
1198         break;
1199       case CEED_EVAL_INTERP:
1200         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1201         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1202         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1203         code << "      }\n";
1204         break;
1205       case CEED_EVAL_GRAD:
1206         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1207         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1208         break;
1209       case CEED_EVAL_WEIGHT:
1210         code << "      CeedScalar r_q"<<i<<"[1];\n";
1211         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1212         break; // No action
1213       case CEED_EVAL_DIV:
1214         break; // TODO: Not implemented
1215       case CEED_EVAL_CURL:
1216         break; // TODO: Not implemented
1217       }
1218     }
1219     code << "\n      // -- Output fields --\n";
1220     for (CeedInt i = 0; i < numoutputfields; i++) {
1221       code << "      // ---- Output field "<<i<<" ----\n";
1222       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1223       CeedChkBackend(ierr);
1224       // Basis action
1225       switch (emode) {
1226       case CEED_EVAL_NONE:
1227         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1228         break; // No action
1229       case CEED_EVAL_INTERP:
1230         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1231         break;
1232       case CEED_EVAL_GRAD:
1233         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1234         break;
1235       case CEED_EVAL_WEIGHT:
1236         break; // Should not occur
1237       case CEED_EVAL_DIV:
1238         break; // TODO: Not implemented
1239       case CEED_EVAL_CURL:
1240         break; // TODO: Not implemented
1241       }
1242     }
1243   } else {
1244     code << "\n      // Note: No Collocated Gradient\n";
1245     code << "      // -- Input fields --\n";
1246     for (CeedInt i = 0; i < numinputfields; i++) {
1247       code << "      // ---- Input field "<<i<<" ----\n";
1248       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1249     }
1250     code << "      // -- Output fields --\n";
1251     for (CeedInt i = 0; i < numoutputfields; i++) {
1252       code << "      // ---- Output field "<<i<<" ----\n";
1253       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1254     }
1255   }
1256   code << "\n      // -- QFunction Inputs and outputs --\n";
1257   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1258   for (CeedInt i = 0; i < numinputfields; i++) {
1259     code << "      // ---- Input field "<<i<<" ----\n";
1260     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1261   }
1262   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1263   for (CeedInt i = 0; i < numoutputfields; i++) {
1264     code << "      // ---- Output field "<<i<<" ----\n";
1265     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1266   }
1267   code << "\n      // -- Apply QFunction --\n";
1268   code << "      "<<qFunctionName<<"(ctx, ";
1269   if (dim != 3 || useCollograd) {
1270     code << "1";
1271   } else {
1272     code << "Q1d";
1273   }
1274   code << ", in, out);\n";
1275   if (useCollograd) {
1276     code << "\n      // Note: Collocated Gradient\n";
1277     code << "      // -- Output fields --\n";
1278     for (CeedInt i = 0; i < numoutputfields; i++) {
1279       code << "      // ---- Output field "<<i<<" ----\n";
1280       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1281       CeedChkBackend(ierr);
1282       // Basis action
1283       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1284       switch (emode) {
1285       case CEED_EVAL_NONE:
1286         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1287         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1288         code << "      }\n";
1289         break; // No action
1290       case CEED_EVAL_INTERP:
1291         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1292         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1293         code << "      }\n";
1294         break;
1295       case CEED_EVAL_GRAD:
1296         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1297         break;
1298       case CEED_EVAL_WEIGHT:
1299         break; // Should not occur
1300       case CEED_EVAL_DIV:
1301         break; // TODO: Not implemented
1302       case CEED_EVAL_CURL:
1303         break; // TODO: Not implemented
1304       }
1305     }
1306     code << "    }\n";
1307   }
1308 
1309   // Output basis apply if needed
1310   // Generate the correct eval mode code for each output
1311   code << "\n    // -- Output field basis action and restrictions --\n";
1312   for (CeedInt i = 0; i < numoutputfields; i++) {
1313     code << "    // ---- Output field "<<i<<" ----\n";
1314     // Get elemsize, emode, ncomp
1315     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1316     CeedChkBackend(ierr);
1317     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1318     CeedChkBackend(ierr);
1319     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1320     CeedChkBackend(ierr);
1321     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1322     CeedChkBackend(ierr);
1323     // Basis action
1324     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1325     switch (emode) {
1326     case CEED_EVAL_NONE:
1327       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1328       break; // No action
1329     case CEED_EVAL_INTERP:
1330       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1331       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1332       break;
1333     case CEED_EVAL_GRAD:
1334       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1335       if (useCollograd) {
1336         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1337       } else {
1338         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1339       }
1340       break;
1341     // LCOV_EXCL_START
1342     case CEED_EVAL_WEIGHT: {
1343       Ceed ceed;
1344       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1345       return CeedError(ceed, CEED_ERROR_BACKEND,
1346                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1347       break; // Should not occur
1348     }
1349     case CEED_EVAL_DIV:
1350       break; // TODO: Not implemented
1351     case CEED_EVAL_CURL:
1352       break; // TODO: Not implemented
1353       // LCOV_EXCL_STOP
1354     }
1355     // Restriction
1356       bool isStrided;
1357       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1358     if (!isStrided) {
1359       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1360       CeedChkBackend(ierr);
1361       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1362       CeedInt compstride;
1363       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1364       code << "    // CompStride: "<<compstride<<"\n";
1365       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1366       data->indices.out[i] = restr_data->d_ind;
1367       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1368     } else {
1369       bool backendstrides;
1370       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1371       CeedChkBackend(ierr);
1372       CeedInt nelem;
1373       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1374       CeedChkBackend(ierr);
1375       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1376       if (!backendstrides) {
1377         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1378         CeedChkBackend(ierr);
1379       }
1380       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1381       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1382     }
1383   }
1384 
1385   code << "  }\n";
1386   code << "}\n";
1387   code << "// -----------------------------------------------------------------------------\n\n";
1388 
1389   // View kernel for debugging
1390   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1391   CeedDebug(ceed, code.str().c_str());
1392 
1393   CeedInt block_sizes[3];
1394   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1395   CeedChkBackend(ierr);
1396   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1397                          "T1d", block_sizes[0],
1398 			 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1399   CeedChkBackend(ierr);
1400   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1401   CeedChkBackend(ierr);
1402 
1403   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1404   return CEED_ERROR_SUCCESS;
1405 }
1406 //------------------------------------------------------------------------------
1407