xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 539ec17d7efe6a80c4ab8b3d6b91c3433981191e)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <iostream>
22 #include <string>
23 #include <sstream>
24 #include "ceed-hip-gen.h"
25 #include "../hip-ref/ceed-hip-ref.h"
26 #include "../hip-shared/ceed-hip-shared.h"
27 #include "../hip/ceed-hip-compile.h"
28 
29 
30 //------------------------------------------------------------------------------
31 // Calculate the block size used for launching the operator kernel
32 //------------------------------------------------------------------------------
33 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
34      	                                  const CeedInt P1d, const CeedInt Q1d,
35 				          CeedInt *block_sizes) {
36 
37   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
38   if (dim==1) {
39     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
40     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
41     block_sizes[0] = thread1d;
42     block_sizes[1] = 1;
43     block_sizes[2] = elemsPerBlock;
44   } else if (dim==2) {
45     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
46     block_sizes[0] = thread1d;
47     block_sizes[1] = thread1d;
48     block_sizes[2] = elemsPerBlock;
49   } else if (dim==3) {
50     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
51     block_sizes[0] = thread1d;
52     block_sizes[1] = thread1d;
53     block_sizes[2] = elemsPerBlock;
54   }
55   return CEED_ERROR_SUCCESS;
56 }
57 
58 static const char *deviceFunctions = QUOTE(
59 
60 //------------------------------------------------------------------------------
61 // Typedefs
62 //------------------------------------------------------------------------------
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;  // device pointers for up to 16 input/output operator fields
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;           // restriction index arrays for up to 16 input/output fields

// Per-thread context cached at kernel entry: thread indices plus this
// thread's view of the contraction scratch slice (presumably shared memory
// set up by the generated kernel — confirm in the kernel body).
typedef struct {
  CeedInt tidx;       // thread index in x
  CeedInt tidy;       // thread index in y
  CeedInt tidz;       // thread index in z (element within the block)
  CeedInt tid;        // flattened thread index within the block
  CeedScalar* slice;  // scratch slice used by the contraction routines below
} BackendData;
73 
74 //------------------------------------------------------------------------------
75 // Load matrices for basis actions
76 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  // Cooperatively copy the P*Q basis matrix d_B into B, each thread striding
  // by the full block size so the whole block shares the work
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}
82 
83 //------------------------------------------------------------------------------
84 // 1D
85 //------------------------------------------------------------------------------
86 
87 //------------------------------------------------------------------------------
88 // L-vector -> E-vector, offsets provided
89 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values for this element's node (one node per x-thread) from
  // the indexed L-vector d_u into registers r_u; components sit COMPSTRIDE
  // apart in d_u. nnodes is unused here.
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
99 
100 //------------------------------------------------------------------------------
101 // L-vector -> E-vector, strided
102 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values for this element's node from a strided L-vector:
  // address = node*STRIDES_NODE + comp*STRIDES_COMP + elem*STRIDES_ELEM
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
112 
113 //------------------------------------------------------------------------------
114 // E-vector -> L-vector, offsets provided
115 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add registers r_v back into the indexed L-vector d_v; atomicAdd
  // is used because different elements can reference the same node index.
  // nnodes is unused here.
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
125 
126 //------------------------------------------------------------------------------
127 // E-vector -> L-vector, strided
128 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add registers r_v into a strided L-vector with a plain +=.
  // NOTE(review): assumes the strides give each (node, elem) a unique
  // address, so no atomics are needed — confirm for the strided layouts used.
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
138 
139 //------------------------------------------------------------------------------
140 // 1D tensor contraction x
141 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // V = B * U contracted over x: stage each thread's value in the shared
  // slice, then each of the Q1d output threads dots a row of B against it.
  // Barriers bracket the slice so it can be safely reused by repeated calls.
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
152 
153 //------------------------------------------------------------------------------
154 // 1D transpose tensor contraction x
155 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // V = B^T * U contracted over x: stage U in the shared slice, then each of
  // the P1d output threads dots a column of B (transposed access) against it
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
166 
167 //------------------------------------------------------------------------------
168 // 1D interpolate to quadrature points
169 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Interpolate nodes -> quadrature points: one x-contraction per component
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
175 
176 //------------------------------------------------------------------------------
177 // 1D interpolate transpose
178 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation (quadrature points -> nodes), per component
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
184 
185 //------------------------------------------------------------------------------
186 // 1D derivatives at quadrature points
187 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // 1D derivative at quadrature points: contract with the gradient matrix
  // c_G (c_B is unused in 1D but kept for a uniform interface)
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
193 
194 //------------------------------------------------------------------------------
195 // 1D derivatives transpose
196 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of grad1d: contract with c_G transposed (c_B unused in 1D)
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
202 
203 //------------------------------------------------------------------------------
204 // 2D
205 //------------------------------------------------------------------------------
206 
207 //------------------------------------------------------------------------------
208 // L-vector -> E-vector, offsets provided
209 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values for this element's (tidx, tidy) node from the indexed
  // L-vector d_u into registers r_u; nnodes is unused here.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
219 
220 //------------------------------------------------------------------------------
221 // L-vector -> E-vector, strided
222 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values for this element's (tidx, tidy) node from a strided
  // L-vector: address = node*STRIDES_NODE + comp*STRIDES_COMP + elem*STRIDES_ELEM
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
232 
233 //------------------------------------------------------------------------------
234 // E-vector -> L-vector, offsets provided
235 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add registers r_v into the indexed L-vector; atomicAdd guards
  // against elements that share a node index. nnodes is unused here.
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
245 
246 //------------------------------------------------------------------------------
247 // E-vector -> L-vector, strided
248 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add registers r_v into a strided L-vector with a plain +=.
  // NOTE(review): assumes strided addresses are unique per (node, elem).
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
258 
259 //------------------------------------------------------------------------------
260 // 2D tensor contraction x
261 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract over x: stage the 2D slab in the shared slice (row stride T1d),
  // then each (qx < Q1d, py < P1d) thread dots row qx of B with its y-row
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
272 
273 //------------------------------------------------------------------------------
274 // 2D tensor contract y
275 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract over y: stage the slab in the shared slice, then each
  // (qx < Q1d, qy < Q1d) thread dots row qy of B with its x-column
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
286 
287 //------------------------------------------------------------------------------
288 // 2D transpose tensor contract y
289 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction over y: B is accessed transposed (column py), and
  // output threads cover (Q1d x P1d) instead of (Q1d x Q1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
300 
301 //------------------------------------------------------------------------------
302 // 2D transpose tensor contract x
303 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction over x: B accessed transposed (column px), output
  // threads cover the (P1d x P1d) node slab
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
314 
315 //------------------------------------------------------------------------------
316 // 2D transpose tensor contract and add x
317 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractXTranspose2d but accumulates into V (no zeroing), so a
  // second directional contribution can be summed in place
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
327 
328 //------------------------------------------------------------------------------
329 // 2D interpolate to quadrature points
330 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: X then Y contraction per component,
  // passing the intermediate through the one-value temp r_t
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
339 
340 //------------------------------------------------------------------------------
341 // 2D interpolate transpose
342 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: reverse order of interp2d (Y^T then X^T)
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
351 
352 //------------------------------------------------------------------------------
353 // 2D derivatives at quadrature points
354 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // 2D gradient at quadrature points: d/dx = (G_x, B_y), d/dy = (B_x, G_y);
  // the two directional results are stored NCOMP apart in r_V
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}
365 
366 //------------------------------------------------------------------------------
367 // 2D derivatives transpose
368 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose 2D gradient: both directional pieces are accumulated into the
  // same r_V slot — the second term uses the Add variant to sum in place
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
379 
380 //------------------------------------------------------------------------------
381 // 3D
382 //------------------------------------------------------------------------------
383 
384 //------------------------------------------------------------------------------
385 // L-vector -> E-vector, offsets provided
386 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather this element's full z-column of nodes per (tidx, tidy) thread from
  // the indexed L-vector; registers hold r_u[z + comp*P1d]. nnodes unused.
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}
397 
398 //------------------------------------------------------------------------------
399 // L-vector -> E-vector, strided
400 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather this element's full z-column of nodes per (tidx, tidy) thread from
  // a strided L-vector; registers hold r_u[z + comp*P1d]
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}
411 
412 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
414 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values at quadrature slice q of this element from the
  // indexed Q-vector d_u into registers r_u (components COMPSTRIDE apart).
  // nquads is unused here; kept for interface symmetry with other readers.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; // removed stray ';;'
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
424 
425 //------------------------------------------------------------------------------
426 // E-vector -> Q-vector, strided
427 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  // Gather NCOMP values at quadrature slice q of this element from a strided
  // Q-vector into registers r_u
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
437 
438 //------------------------------------------------------------------------------
439 // E-vector -> L-vector, offsets provided
440 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add each thread's z-column of register values r_v[z + comp*P1d]
  // into the indexed L-vector; atomicAdd handles nodes shared between
  // elements. nnodes is unused here.
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}
451 
452 //------------------------------------------------------------------------------
453 // E-vector -> L-vector, strided
454 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  // Scatter-add each thread's z-column of register values into a strided
  // L-vector with a plain += (NOTE(review): assumes unique addresses)
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}
465 
466 //------------------------------------------------------------------------------
467 // 3D tensor contract x
468 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract over x for each of the P1d z-levels; this thread's row of B is
  // cached in registers first. NOTE(review): r_B is loaded for every tidx,
  // so threads with tidx >= Q1d read past row Q1d of B when P1d > Q1d —
  // results are masked out below, but confirm B's buffer is sized for this.
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  // One shared-memory slab exchange per z-level, bracketed by barriers
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
485 
486 //------------------------------------------------------------------------------
487 // 3D tensor contract y
488 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract over y for each of the P1d z-levels, with this thread's row of
  // B cached in registers; shared slab exchange per level
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
505 
506 //------------------------------------------------------------------------------
507 // 3D tensor contract z
508 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Contract over z entirely in registers: each (x, y) thread holds its full
  // z-column in U, so no shared memory or barrier is needed
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
518 
519 //------------------------------------------------------------------------------
520 // 3D transpose tensor contract z
521 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction over z in registers: B accessed transposed
  // (column k), producing P1d outputs from the Q1d-long z-column U
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
531 
532 //------------------------------------------------------------------------------
533 // 3D transpose tensor contract y
534 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction over y per z-level: this thread's column of B
  // (transposed access) is cached in registers, then a shared slab exchange
  // per level feeds the dot product
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
551 
552 //------------------------------------------------------------------------------
553 // 3D transpose tensor contract add y
554 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeY3d but accumulates into V (no zeroing), so a
  // second directional contribution can be summed in place
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
570 
571 //------------------------------------------------------------------------------
572 // 3D transpose tensor contract x
573 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Transpose contraction over x per z-level: this thread's column of B
  // (transposed access) is cached in registers, shared slab exchange per level
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
590 
591 //------------------------------------------------------------------------------
592 // 3D transpose tensor contract add x
593 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Same as ContractTransposeX3d but accumulates into V (no zeroing)
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
609 
610 //------------------------------------------------------------------------------
611 // 3D interpolate to quadrature points
612 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Tensor-product interpolation: X, Y, then Z contraction per component,
  // through z-column temporaries r_t1/r_t2 of length T1d
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}
623 
624 //------------------------------------------------------------------------------
625 // 3D interpolate transpose
626 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Transpose interpolation: reverse order of interp3d (Z^T, Y^T, X^T)
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
637 
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Tensor-product gradient: for each component, the 1D derivative matrix c_G is
// applied along one direction while c_B interpolates the other two. The three
// partial derivatives are stacked NCOMP*Q1d apart in r_V:
//   r_V[comp*Q1d + d*NCOMP*Q1d] holds d/d{X,Y,Z} for d = 0, 1, 2.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers for intermediate contraction results
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // d/dX: differentiate in X, interpolate in Y and Z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    // d/dY: differentiate in Y, interpolate in X and Z
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    // d/dZ: differentiate in Z, interpolate in X and Y
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
657 
//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of grad3d: contracts the three stacked derivative fields in r_U
// (each NCOMP*Q1d apart) back to nodal values and sums them into r_V. The
// first term initializes r_V (ContractTransposeX3d); the second and third
// accumulate onto it (ContractTransposeAddX3d).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Scratch registers for intermediate contraction results
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    // (d/dX)^T term: derivative transpose applied in X, interpolation in Y, Z
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    // (d/dY)^T term, accumulated
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    // (d/dZ)^T term, accumulated
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
677 
//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Computes the X/Y/Z derivatives at quadrature slice q using the collocated
// differentiation matrix c_G (Q1d*Q1d entries, loaded elsewhere).
//   r_U: interpolated values, NCOMP blocks of Q1d entries (one per z-slice)
//   r_V: output derivatives, laid out r_V[comp + d*NCOMP] for d = 0, 1, 2
// Fix: __syncthreads() must be reached by every thread in the block. The
// original placed the barriers inside the (tidx, tidy) < Q1d guard, which
// diverges whenever the block is wider than Q1d (e.g. P1d > Q1d) -- undefined
// behavior. The barriers are now hoisted out of the guard; only the
// shared-memory traffic and the accumulation remain predicated.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice q in shared memory for the X/Y contractions
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register-only; reads r_U across slices, not shared memory)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites the shared slice
    __syncthreads();
  }
}
703 
//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of gradCollo3d: accumulates the adjoint of the X/Y/Z collocated
// derivatives for slice q into r_V (note the transposed c_G indexing).
//   r_U: derivative values for this slice, laid out r_U[comp + d*NCOMP]
//   r_V: accumulated quadrature values, NCOMP blocks of Q1d entries
// Fix: __syncthreads() must be reached by every thread in the block. The
// original placed the barriers inside the (tidx, tidy) < Q1d guard, which
// diverges whenever the block is wider than Q1d (e.g. P1d > Q1d) -- undefined
// behavior. The barriers are now hoisted out of the guard.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative (register-only; no shared memory, so no trailing barrier)
    if (active)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
729 
//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Each thread along x picks up its 1D quadrature weight; threads outside the
// quadrature range write 0.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  if (data.tidx < Q1d)
    *w = qweight1d[data.tidx];
  else
    *w = 0.0;
}
737 
//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weight for the (tidx, tidy) quadrature point; threads outside
// the quadrature grid write 0.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  *w = active ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}
746 
//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weights for the column of Q1d quadrature points at
// (tidx, tidy); threads outside the quadrature grid write 0 for every slice.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = pw*qweight1d[z];
  } else {
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = 0.0;
  }
}
757 
758 );
759 //------------------------------------------------------------------------------
// Build single operator kernel
761 //------------------------------------------------------------------------------
762 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
763 
764   using std::ostringstream;
765   using std::string;
766   int ierr;
767   bool setupdone;
768   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
769   if (setupdone) return CEED_ERROR_SUCCESS;
770   Ceed ceed;
771   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
772   CeedOperator_Hip_gen *data;
773   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
774   CeedQFunction qf;
775   CeedQFunction_Hip_gen *qf_data;
776   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
777   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
778   CeedSize lsize;
779   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
780           numoutputfields, ncomp, dim = 0;
781   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
782   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
783   CeedOperatorField *opinputfields, *opoutputfields;
784   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
785   CeedChkBackend(ierr);
786   CeedQFunctionField *qfinputfields, *qfoutputfields;
787   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
788   CeedChkBackend(ierr);
789   CeedEvalMode emode;
790   CeedBasis basis;
791   CeedBasis_Hip_shared *basis_data;
792   CeedElemRestriction Erestrict;
793   CeedElemRestriction_Hip *restr_data;
794 
795   // Check for restriction only identity operator
796   bool is_identity_qf;
797   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
798   if (is_identity_qf) {
799     CeedEvalMode emodein, emodeout;
800     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
801     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
802     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
803       // LCOV_EXCL_START
804       return CeedError(ceed, CEED_ERROR_BACKEND,
805                        "Backend does not implement restriction only identity operators");
806     // LCOV_EXCL_STOP
807   }
808 
809   ostringstream code;
810   string devFunctions(deviceFunctions);
811   code << devFunctions;
812 
813   string qFunction(qf_data->qFunctionSource);
814   string qFunctionName(qf_data->qFunctionName);
815   string oper;
816   oper = "CeedKernel_Hip_gen_" + qFunctionName;
817 
818   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
819   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
820   code << "#define CeedPragmaSIMD\n";
821   code << "#define CEED_ERROR_SUCCESS 0\n\n";
822 
823   // Find dim and Q1d
824   bool useCollograd = true;
825   data->maxP1d = 0;
826   for (CeedInt i = 0; i < numinputfields; i++) {
827     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
828     if (basis != CEED_BASIS_COLLOCATED) {
829       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
830       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
831       CeedChkBackend(ierr);
832 
833       // Check for collocated gradient
834       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
835 
836       // Collect dim and Q1d
837       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
838       bool isTensor;
839       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
840       if (isTensor) {
841         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
842         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
843         if (P1d>data->maxP1d) data->maxP1d = P1d;
844       } else {
845         // LCOV_EXCL_START
846         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
847         // LCOV_EXCL_STOP
848         }
849     }
850   }
851   // Check output bases for Q1d, dim as well
//   The only input basis might be CEED_BASIS_COLLOCATED
853   for (CeedInt i = 0; i < numoutputfields; i++) {
854     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
855 
856     if (basis != CEED_BASIS_COLLOCATED) {
857       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
858       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
859       CeedChkBackend(ierr);
860 
861       // Collect dim and Q1d
862       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
863       bool isTensor;
864       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
865       if (isTensor) {
866         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
867       } else {
868         // LCOV_EXCL_START
869         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
870         // LCOV_EXCL_STOP
871         }
872 
873       // Check for collocated gradient
874       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
875     }
876   }
877   data->dim = dim;
878   data->Q1d = Q1d;
879 
880   // Define CEED_Q_VLA
881   if (dim != 3 || useCollograd) {
882     code << "\n#define CEED_Q_VLA 1\n\n";
883   } else {
884     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
885   }
886 
887   code << qFunction;
888 
889   // Setup
890   code << "\n// -----------------------------------------------------------------------------\n";
891   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
892   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
893   for (CeedInt i = 0; i < numinputfields; i++) {
894     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
895     CeedChkBackend(ierr);
896     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
897       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
898     }
899   }
900 
901   for (CeedInt i = 0; i < numoutputfields; i++) {
902     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
903   }
904 
905   code << "  const CeedInt Dim = "<<dim<<";\n";
906   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
907 
908   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
909   code << "  BackendData data;\n";
910   code << "  data.tidx = threadIdx.x;\n";
911   code << "  data.tidy = threadIdx.y;\n";
912   code << "  data.tidz = threadIdx.z;\n";
913   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
914   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
915 
916   code << "\n  // -- Input field constants and basis data --\n";
917   //Initialize constants, and matrices B and G
918   for (CeedInt i = 0; i < numinputfields; i++) {
919     code << "  // ---- Input field "<<i<<" ----\n";
920     // Get elemsize, emode, ncomp
921     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
922     CeedChkBackend(ierr);
923     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
924     CeedChkBackend(ierr);
925     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
926     CeedChkBackend(ierr);
927     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
928     CeedChkBackend(ierr);
929 
930     // Set field constants
931     if (emode != CEED_EVAL_WEIGHT) {
932       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
933       if (basis != CEED_BASIS_COLLOCATED) {
934         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
935         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
936       } else {
937         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
938       }
939       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
940     }
941 
942     // Load basis data
943     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
944     switch (emode) {
945     case CEED_EVAL_NONE:
946       break;
947     case CEED_EVAL_INTERP:
948       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
949       data->B.in[i] = basis_data->d_interp_1d;
950       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
951       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
952       break;
953     case CEED_EVAL_GRAD:
954       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
955       data->B.in[i] = basis_data->d_interp_1d;
956       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
957       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
958       if (useCollograd) {
959         data->G.in[i] = basis_data->d_collo_grad_1d;
960         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
961         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
962       } else {
963         data->G.in[i] = basis_data->d_grad_1d;
964         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
965         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
966       }
967       break;
968     case CEED_EVAL_WEIGHT:
969       break; // No action
970     case CEED_EVAL_DIV:
971       break; // TODO: Not implemented
972     case CEED_EVAL_CURL:
973       break; // TODO: Not implemented
974     }
975   }
976 
977   code << "\n  // -- Output field constants and basis data --\n";
978   for (CeedInt i = 0; i < numoutputfields; i++) {
979     code << "  // ---- Output field "<<i<<" ----\n";
980     // Get elemsize, emode, ncomp
981     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
982     CeedChkBackend(ierr);
983     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
984     CeedChkBackend(ierr);
985     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
986     CeedChkBackend(ierr);
987     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
988     CeedChkBackend(ierr);
989 
990     // Set field constants
991     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
992     if (basis != CEED_BASIS_COLLOCATED) {
993       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
994       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
995     } else {
996       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
997     }
998     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
999 
1000     // Load basis data
1001     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1002     switch (emode) {
1003     case CEED_EVAL_NONE:
1004       break; // No action
1005     case CEED_EVAL_INTERP:
1006       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1007       data->B.out[i] = basis_data->d_interp_1d;
1008       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1009       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1010       break;
1011     case CEED_EVAL_GRAD:
1012       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1013       data->B.out[i] = basis_data->d_interp_1d;
1014       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1015       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1016       if (useCollograd) {
1017         data->G.out[i] = basis_data->d_collo_grad_1d;
1018         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1019         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1020       } else {
1021         data->G.out[i] = basis_data->d_grad_1d;
1022         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1023         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1024       }
1025       break;
1026     // LCOV_EXCL_START
1027     case CEED_EVAL_WEIGHT: {
1028       Ceed ceed;
1029       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1030       return CeedError(ceed, CEED_ERROR_BACKEND,
1031                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1032       break; // Should not occur
1033     }
1034     case CEED_EVAL_DIV:
1035       break; // TODO: Not implemented
1036     case CEED_EVAL_CURL:
1037       break; // TODO: Not implemented
1038       // LCOV_EXCL_STOP
1039     }
1040   }
1041   code << "\n  // -- Element loop --\n";
1042   code << "  __syncthreads();\n";
1043   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1044   // Input basis apply if needed
1045   // Generate the correct eval mode code for each input
1046   code << "    // -- Input field restrictions and basis actions --\n";
1047   for (CeedInt i = 0; i < numinputfields; i++) {
1048     code << "    // ---- Input field "<<i<<" ----\n";
1049     // Get elemsize, emode, ncomp
1050     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1051     CeedChkBackend(ierr);
1052     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1053     CeedChkBackend(ierr);
1054     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1055     CeedChkBackend(ierr);
1056     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1057     CeedChkBackend(ierr);
1058 
1059     // Restriction
1060     if (emode != CEED_EVAL_WEIGHT &&
1061         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1062       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1063 
1064       bool isStrided;
1065       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1066       if (!isStrided) {
1067         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1068         CeedChkBackend(ierr);
1069         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1070         CeedInt compstride;
1071         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1072         code << "    // CompStride: "<<compstride<<"\n";
1073         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1074         data->indices.in[i] = restr_data->d_ind;
1075         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1076       } else {
1077         bool backendstrides;
1078         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1079         CeedChkBackend(ierr);
1080         CeedInt nelem;
1081         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1082         CeedChkBackend(ierr);
1083         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1084         if (!backendstrides) {
1085           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1086           CeedChkBackend(ierr);
1087         }
1088         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1089         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1090       }
1091     }
1092 
1093     // Basis action
1094     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1095     switch (emode) {
1096     case CEED_EVAL_NONE:
1097       if (!useCollograd) {
1098         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1099       }
1100       break;
1101     case CEED_EVAL_INTERP:
1102       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1103       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1104       break;
1105     case CEED_EVAL_GRAD:
1106       if (useCollograd) {
1107         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1108         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1109       } else {
1110         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1111         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1112       }
1113       break;
1114     case CEED_EVAL_WEIGHT:
1115       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1116       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1117       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1118       data->W = basis_data->d_q_weight_1d;
1119       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1120       break; // No action
1121     case CEED_EVAL_DIV:
1122       break; // TODO: Not implemented
1123     case CEED_EVAL_CURL:
1124       break; // TODO: Not implemented
1125     }
1126   }
1127 
1128   // Q function
1129   code << "\n    // -- Output field setup --\n";
1130   for (CeedInt i = 0; i < numoutputfields; i++) {
1131       code << "\n    // ---- Output field "<<i<<" ----\n";
1132     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1133     CeedChkBackend(ierr);
1134     if (emode==CEED_EVAL_GRAD)
1135     {
1136       if (useCollograd) {
1137         //Accumulator for gradient slices
1138         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1139         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1140         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1141         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1142         code << "      }\n";
1143         code << "    }\n";
1144       } else {
1145         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1146       }
1147     }
1148     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1149     {
1150       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1151     }
1152   }
1153   // We treat quadrature points per slice in 3d to save registers
1154   if (useCollograd) {
1155     code << "\n    // Note: Collocated Gradient\n";
1156     code << "#pragma unroll\n";
1157     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1158     code << "      // -- Input fields --\n";
1159     for (CeedInt i = 0; i < numinputfields; i++) {
1160       code << "      // ---- Input field "<<i<<" ----\n";
1161       // Get elemsize, emode, ncomp
1162       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1163       CeedChkBackend(ierr);
1164       // Basis action
1165       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1166       switch (emode) {
1167       case CEED_EVAL_NONE:
1168         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1169 
1170         bool isStrided;
1171         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1172         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1173         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1174         if (!isStrided) {
1175           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1176           CeedChkBackend(ierr);
1177           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1178           CeedInt compstride;
1179           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1180           code << "      // CompStride: "<<compstride<<"\n";
1181           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1182           data->indices.in[i] = restr_data->d_ind;
1183           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1184         } else {
1185           bool backendstrides;
1186           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1187           CeedChkBackend(ierr);
1188           CeedInt nelem;
1189           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1190           CeedChkBackend(ierr);
1191           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1192           if (!backendstrides) {
1193             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1194             CeedChkBackend(ierr);
1195           }
1196           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1197           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1198         }
1199         break;
1200       case CEED_EVAL_INTERP:
1201         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1202         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1203         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1204         code << "      }\n";
1205         break;
1206       case CEED_EVAL_GRAD:
1207         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1208         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1209         break;
1210       case CEED_EVAL_WEIGHT:
1211         code << "      CeedScalar r_q"<<i<<"[1];\n";
1212         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1213         break; // No action
1214       case CEED_EVAL_DIV:
1215         break; // TODO: Not implemented
1216       case CEED_EVAL_CURL:
1217         break; // TODO: Not implemented
1218       }
1219     }
1220     code << "\n      // -- Output fields --\n";
1221     for (CeedInt i = 0; i < numoutputfields; i++) {
1222       code << "      // ---- Output field "<<i<<" ----\n";
1223       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1224       CeedChkBackend(ierr);
1225       // Basis action
1226       switch (emode) {
1227       case CEED_EVAL_NONE:
1228         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1229         break; // No action
1230       case CEED_EVAL_INTERP:
1231         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1232         break;
1233       case CEED_EVAL_GRAD:
1234         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1235         break;
1236       case CEED_EVAL_WEIGHT:
1237         break; // Should not occur
1238       case CEED_EVAL_DIV:
1239         break; // TODO: Not implemented
1240       case CEED_EVAL_CURL:
1241         break; // TODO: Not implemented
1242       }
1243     }
1244   } else {
1245     code << "\n      // Note: No Collocated Gradient\n";
1246     code << "      // -- Input fields --\n";
1247     for (CeedInt i = 0; i < numinputfields; i++) {
1248       code << "      // ---- Input field "<<i<<" ----\n";
1249       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1250     }
1251     code << "      // -- Output fields --\n";
1252     for (CeedInt i = 0; i < numoutputfields; i++) {
1253       code << "      // ---- Output field "<<i<<" ----\n";
1254       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1255     }
1256   }
1257   code << "\n      // -- QFunction Inputs and outputs --\n";
1258   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1259   for (CeedInt i = 0; i < numinputfields; i++) {
1260     code << "      // ---- Input field "<<i<<" ----\n";
1261     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1262   }
1263   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1264   for (CeedInt i = 0; i < numoutputfields; i++) {
1265     code << "      // ---- Output field "<<i<<" ----\n";
1266     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1267   }
1268   code << "\n      // -- Apply QFunction --\n";
1269   code << "      "<<qFunctionName<<"(ctx, ";
1270   if (dim != 3 || useCollograd) {
1271     code << "1";
1272   } else {
1273     code << "Q1d";
1274   }
1275   code << ", in, out);\n";
1276   if (useCollograd) {
1277     code << "\n      // Note: Collocated Gradient\n";
1278     code << "      // -- Output fields --\n";
1279     for (CeedInt i = 0; i < numoutputfields; i++) {
1280       code << "      // ---- Output field "<<i<<" ----\n";
1281       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1282       CeedChkBackend(ierr);
1283       // Basis action
1284       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1285       switch (emode) {
1286       case CEED_EVAL_NONE:
1287         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1288         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1289         code << "      }\n";
1290         break; // No action
1291       case CEED_EVAL_INTERP:
1292         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1293         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1294         code << "      }\n";
1295         break;
1296       case CEED_EVAL_GRAD:
1297         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1298         break;
1299       case CEED_EVAL_WEIGHT:
1300         break; // Should not occur
1301       case CEED_EVAL_DIV:
1302         break; // TODO: Not implemented
1303       case CEED_EVAL_CURL:
1304         break; // TODO: Not implemented
1305       }
1306     }
1307     code << "    }\n";
1308   }
1309 
1310   // Output basis apply if needed
1311   // Generate the correct eval mode code for each output
1312   code << "\n    // -- Output field basis action and restrictions --\n";
1313   for (CeedInt i = 0; i < numoutputfields; i++) {
1314     code << "    // ---- Output field "<<i<<" ----\n";
1315     // Get elemsize, emode, ncomp
1316     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1317     CeedChkBackend(ierr);
1318     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1319     CeedChkBackend(ierr);
1320     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1321     CeedChkBackend(ierr);
1322     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1323     CeedChkBackend(ierr);
1324     // Basis action
1325     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1326     switch (emode) {
1327     case CEED_EVAL_NONE:
1328       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1329       break; // No action
1330     case CEED_EVAL_INTERP:
1331       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1332       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1333       break;
1334     case CEED_EVAL_GRAD:
1335       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1336       if (useCollograd) {
1337         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1338       } else {
1339         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1340       }
1341       break;
1342     // LCOV_EXCL_START
1343     case CEED_EVAL_WEIGHT: {
1344       Ceed ceed;
1345       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1346       return CeedError(ceed, CEED_ERROR_BACKEND,
1347                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1348       break; // Should not occur
1349     }
1350     case CEED_EVAL_DIV:
1351       break; // TODO: Not implemented
1352     case CEED_EVAL_CURL:
1353       break; // TODO: Not implemented
1354       // LCOV_EXCL_STOP
1355     }
1356     // Restriction
1357       bool isStrided;
1358       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1359     if (!isStrided) {
1360       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1361       CeedChkBackend(ierr);
1362       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1363       CeedInt compstride;
1364       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1365       code << "    // CompStride: "<<compstride<<"\n";
1366       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1367       data->indices.out[i] = restr_data->d_ind;
1368       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1369     } else {
1370       bool backendstrides;
1371       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1372       CeedChkBackend(ierr);
1373       CeedInt nelem;
1374       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1375       CeedChkBackend(ierr);
1376       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1377       if (!backendstrides) {
1378         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1379         CeedChkBackend(ierr);
1380       }
1381       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1382       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1383     }
1384   }
1385 
1386   code << "  }\n";
1387   code << "}\n";
1388   code << "// -----------------------------------------------------------------------------\n\n";
1389 
1390   // View kernel for debugging
1391   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1392   CeedDebug(ceed, code.str().c_str());
1393 
1394   CeedInt block_sizes[3] = {0, 0, 0};
1395   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1396   CeedChkBackend(ierr);
1397   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1398                          "T1d", block_sizes[0],
1399                          "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1400   CeedChkBackend(ierr);
1401   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1402   CeedChkBackend(ierr);
1403 
1404   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1405   return CEED_ERROR_SUCCESS;
1406 }
1407 //------------------------------------------------------------------------------
1408