xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 2288fb5222bbca88523f94a06377dc76b8b46264)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/ceed.h>
11 #include <ceed/backend.h>
12 #include <iostream>
13 #include <string>
14 #include <sstream>
15 #include "ceed-hip-gen.h"
16 #include "../hip-ref/ceed-hip-ref.h"
17 #include "../hip-shared/ceed-hip-shared.h"
18 #include "../hip/ceed-hip-compile.h"
19 
20 
21 //------------------------------------------------------------------------------
22 // Calculate the block size used for launching the operator kernel
23 //------------------------------------------------------------------------------
24 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
25      	                                  const CeedInt P1d, const CeedInt Q1d,
26 				          CeedInt *block_sizes) {
27 
28   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
29   if (dim==1) {
30     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
31     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
32     block_sizes[0] = thread1d;
33     block_sizes[1] = 1;
34     block_sizes[2] = elemsPerBlock;
35   } else if (dim==2) {
36     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
37     block_sizes[0] = thread1d;
38     block_sizes[1] = thread1d;
39     block_sizes[2] = elemsPerBlock;
40   } else if (dim==3) {
41     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
42     block_sizes[0] = thread1d;
43     block_sizes[1] = thread1d;
44     block_sizes[2] = elemsPerBlock;
45   }
46   return CEED_ERROR_SUCCESS;
47 }
48 
49 static const char *deviceFunctions = QUOTE(
50 
//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Bundles of field-array pointers passed to the generated kernel as single
// arguments; 16 slots each for input and output fields.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
// Integer (element-restriction offset) arrays for the same fields.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
56 
// Per-thread bookkeeping handed to every device helper below.
typedef struct {
  CeedInt tidx;       // thread index in x (first tensor direction)
  CeedInt tidy;       // thread index in y (second tensor direction)
  CeedInt tidz;       // thread index in z (element slot within the block)
  CeedInt tid;        // flattened thread index within the block (used with blockDim strides)
  CeedScalar* slice;  // scratch buffer threads exchange values through between
                      // __syncthreads() barriers; rows are strided by T1d
                      // (a constant injected at JIT compile time)
} BackendData;
64 
//------------------------------------------------------------------------------
// Load matrices for basis actions
//
// Block-cooperative copy of a P x Q basis matrix from global memory (d_B) to
// the destination B; each thread copies a strided subset of entries. No
// barrier here — callers are expected to synchronize before using B.
//------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}
73 
74 //------------------------------------------------------------------------------
75 // 1D
76 //------------------------------------------------------------------------------
77 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//
// Each thread with tidx < P1d gathers the NCOMP values of its node for element
// `elem` through the restriction indices; components are COMPSTRIDE apart in
// d_u. nnodes is unused here — presumably kept so the offset and strided
// readers share a calling convention (TODO confirm).
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;  // one node per thread in x
    const CeedInt ind = indices[node + elem * P1d];  // L-vector location of this node
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
90 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//
// Strided variant of the 1D reader: node/component/element locations are
// computed from compile-time strides instead of an index array.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
103 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//
// Scatters per-thread register values back to the L-vector. atomicAdd is
// required because multiple elements (and blocks) may restrict to the same
// L-vector node.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
116 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//
// Strided scatter: plain += is used (no atomics), which assumes the strides
// give each element a disjoint slice of d_v — TODO confirm against callers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
129 
//------------------------------------------------------------------------------
// 1D tensor contraction x
//
// V = sum_i B[tidx, i] * U_i (B stored row-major, rows of length P1d). U is
// staged in the block-shared slice so every thread can read all P1d inputs.
// Both barriers are required: write->read, and read->next reuse of slice.
// All threads must reach them, including tidx >= Q1d.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;  // stage this thread's value for the block
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
143 
//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//
// V = sum_i B[i, tidx] * U_i — same staging/barrier pattern as ContractX1d
// but contracting with B transposed (quad -> dof).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
157 
//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//
// Applies the interpolation matrix c_B component by component; r_U/r_V hold
// one value per component per thread.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
166 
//------------------------------------------------------------------------------
// 1D interpolate transpose
//
// Transpose action of interp1d: quadrature values back to dof values,
// component by component.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
175 
//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//
// In 1D the gradient is a single contraction with the derivative matrix c_G;
// c_B is unused but kept for a signature uniform with higher dimensions.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
184 
//------------------------------------------------------------------------------
// 1D derivatives transpose
//
// Transpose action of grad1d; c_B is unused but kept for signature symmetry.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
193 
194 //------------------------------------------------------------------------------
195 // 2D
196 //------------------------------------------------------------------------------
197 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//
// 2D gather: the (tidx, tidy) thread owns node tidx + tidy*P1d of the element
// and reads its NCOMP values through the restriction indices. nnodes unused
// (kept for a uniform interface — TODO confirm).
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];  // P1d*P1d nodes per element
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
210 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//
// 2D strided gather: locations computed from compile-time node/component/
// element strides instead of an index array.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
223 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//
// 2D scatter; atomicAdd because neighboring elements share L-vector nodes.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
236 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//
// 2D strided scatter; plain += assumes element slices of d_v are disjoint —
// TODO confirm against callers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
249 
//------------------------------------------------------------------------------
// 2D tensor contraction x
//
// V(tidx, tidy) = sum_i B[tidx, i] * U(i, tidy). The block's values are staged
// in slice (row stride T1d, a JIT-time constant); both barriers are required
// and must be reached by every thread, including out-of-range ones.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
263 
//------------------------------------------------------------------------------
// 2D tensor contract y
//
// V(tidx, tidy) = sum_i B[tidy, i] * U(tidx, i); same staging/barrier pattern
// as ContractX2d but running along the slice's y (row) index.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
277 
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//
// Transpose of ContractY2d: V(tidx, tidy) = sum_i B[i, tidy] * U(tidx, i),
// mapping quadrature rows back to dof rows.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
291 
//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//
// Transpose of ContractX2d: V(tidx, tidy) = sum_i B[i, tidx] * U(i, tidy),
// mapping quadrature columns back to dof columns.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
305 
//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//
// Like ContractXTranspose2d but accumulates into V instead of overwriting it
// (no *V = 0.0) — used to sum gradient contributions in gradTranspose2d.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
318 
//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//
// Per component: contract x then y with the interpolation matrix c_B, using
// a one-value register temporary between the two passes.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
330 
//------------------------------------------------------------------------------
// 2D interpolate transpose
//
// Transpose of interp2d: contract y then x with the transposed interpolation
// matrix, quadrature values back to dofs.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
342 
//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//
// Computes both gradient components per field component: d/dx applies c_G in
// x and c_B in y; d/dy applies c_B in x and c_G in y. Output layout in r_V is
// [derivative dimension][component] (stride NCOMP between dimensions).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}
356 
//------------------------------------------------------------------------------
// 2D derivatives transpose
//
// Transpose of grad2d: both derivative components of r_U (layout
// [dimension][component]) are mapped back to dofs and summed into r_V —
// the second pass uses the accumulating ContractXTransposeAdd2d.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
370 
371 //------------------------------------------------------------------------------
372 // 3D
373 //------------------------------------------------------------------------------
374 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//
// 3D gather: each (tidx, tidy) thread loops over the z layers and reads the
// NCOMP values of each of its P1d nodes; registers are laid out as
// r_u[z + comp*P1d]. nnodes unused (uniform interface — TODO confirm).
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}
388 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//
// 3D strided gather; same register layout as readDofsOffset3d
// (r_u[z + comp*P1d]) with locations from compile-time strides.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}
402 
403 //------------------------------------------------------------------------------
404 // E-vector -> Q-vector, offests provided
405 //------------------------------------------------------------------------------
406 template <int NCOMP, int COMPSTRIDE, int Q1d>
407 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
408   if (data.tidx < Q1d && data.tidy < Q1d) {
409     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
411     for (CeedInt comp = 0; comp < NCOMP; ++comp)
412       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
413   }
414 }
415 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//
// Strided variant of the quadrature-slice reader: one quadrature point of
// z-slice `q` per (tidx, tidy) thread, locations from compile-time strides.
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
428 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//
// 3D scatter of the r_v[z + comp*P1d] register layout back to the L-vector;
// atomicAdd because elements share L-vector nodes.
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}
442 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//
// 3D strided scatter; plain += assumes element slices of d_v are disjoint —
// TODO confirm against callers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}
456 
//------------------------------------------------------------------------------
// 3D tensor contract x
//
// For each z layer k: V[k](tidx, tidy) = sum_i B[tidx, i] * U[k](i, tidy).
// The thread's row of B is cached in registers once; each layer is staged in
// the block-shared slice (row stride T1d, JIT-time constant) between the two
// mandatory barriers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];  // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
476 
//------------------------------------------------------------------------------
// 3D tensor contract y
//
// For each z layer k: V[k](tidx, tidy) = sum_i B[tidy, i] * U[k](tidx, i).
// Same register-cached-row + per-layer slice staging pattern as ContractX3d.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];  // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
496 
//------------------------------------------------------------------------------
// 3D tensor contract z
//
// V[k] = sum_i B[k, i] * U[i]. The z direction lives entirely in each
// thread's registers, so no shared-memory staging or barriers are needed.
// V[k] is zeroed unconditionally so out-of-range threads hold defined values.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
509 
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//
// Transpose of ContractZ3d: V[k] = sum_i B[i, k] * U[i], all in registers,
// no barriers required.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
522 
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//
// Transpose of ContractY3d: per layer k, V[k](tidx, tidy) =
// sum_i B[i, tidy] * U[k](tidx, i); B column cached in registers, layers
// staged through slice between barriers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];  // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
542 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//
// Like ContractTransposeY3d but accumulates into V (no V[k] = 0.0) — used to
// sum gradient contributions.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}
561 
//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//
// Transpose of ContractX3d: per layer k, V[k](tidx, tidy) =
// sum_i B[i, tidx] * U[k](i, tidy); B column cached in registers, layers
// staged through slice between barriers.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];  // cache this thread's B column

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
581 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//
// Like ContractTransposeX3d but accumulates into V (no V[k] = 0.0) — used to
// sum gradient contributions in gradTranspose3d.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
600 
//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//
// Per component: contract x, then y, then z with c_B. The temporaries are
// sized T1d (JIT-time constant, large enough for either P1d or Q1d entries);
// r_U is laid out [comp][P1d], r_V as [comp][Q1d].
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}
614 
//------------------------------------------------------------------------------
// 3D interpolate transpose
//
// Transpose of interp3d: contract z, then y, then x with the transposed
// interpolation matrix; r_U laid out [comp][Q1d], r_V as [comp][P1d].
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}
628 
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//
// For each component computes the three gradient components by applying the
// derivative matrix c_G in exactly one direction and the interpolation matrix
// c_B in the other two. Output layout in r_V is [derivative dimension]
// [component][Q1d] (stride NCOMP*Q1d between dimensions).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
648 
649 //------------------------------------------------------------------------------
650 // 3D derivatives transpose
651 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose gradient (quadrature -> nodes): each of the three directional
  // derivative slabs of r_U is contracted back, using the derivative matrix in
  // its own direction and interpolation in the other two. The first slab writes
  // the output; the remaining two accumulate into it (ContractTransposeAddX3d).
  CeedScalar r_tmp_a[T1d];
  CeedScalar r_tmp_b[T1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    CeedScalar *r_comp_out = r_V + comp*P1d;
    // X-derivative slab: initializes r_comp_out
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_tmp_a);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_tmp_a, c_B, r_tmp_b);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_tmp_b, c_G, r_comp_out);
    // Y-derivative slab: accumulates
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_tmp_a);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_tmp_a, c_G, r_tmp_b);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_tmp_b, c_B, r_comp_out);
    // Z-derivative slab: accumulates
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_tmp_a);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_tmp_a, c_B, r_tmp_b);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_tmp_b, c_B, r_comp_out);
  }
}
668 
669 //------------------------------------------------------------------------------
670 // 3D collocated derivatives computation
671 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Collocated derivatives at z-slice q: stage one component's slice of r_U in
  // shared memory, then contract with the collocated 1D derivative matrix c_G
  // in x, y, and z. Output layout: r_V[comp + d*NCOMP] for direction d.
  //
  // Fix: __syncthreads() is a block-wide barrier and must be reached by every
  // thread in the block. The blocks are launched with thread1d = max(P1d, Q1d)
  // threads per direction, so when P1d > Q1d some threads fail the tile guard;
  // the barriers therefore sit OUTSIDE the guard (the previous placement inside
  // the divergent branch is undefined behavior).
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice; only the Q1d x Q1d tile holds valid data
    if (data.tidx < Q1d && data.tidy < Q1d)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < Q1d) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register data only, no shared memory involved)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    __syncthreads(); // Protect the slice from the next iteration's overwrite
  }
}
694 
695 //------------------------------------------------------------------------------
696 // 3D collocated derivatives transpose
697 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose of the collocated gradient at z-slice q: accumulate the three
  // directional slabs of r_U (r_U[comp + d*NCOMP]) back into r_V.
  //
  // Fix: __syncthreads() is a block-wide barrier and must be reached by every
  // thread in the block. Blocks may contain threads with tidx/tidy >= Q1d
  // (thread1d = max(P1d, Q1d)), so the barriers sit OUTSIDE the tile guard
  // (the previous placement inside the divergent branch is undefined behavior).
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative: stage the x-slab slice in shared memory
    if (data.tidx < Q1d && data.tidy < Q1d)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative: stage the y-slab slice in shared memory
    if (data.tidx < Q1d && data.tidy < Q1d)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative: register data only, no shared memory, so no barrier needed
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}
720 
721 //------------------------------------------------------------------------------
722 // 1D quadrature weights
723 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 1D quadrature weight for this thread's point; threads outside the
  // quadrature range receive zero so they contribute nothing downstream.
  if (data.tidx < Q1d) {
    *w = qweight1d[data.tidx];
  } else {
    *w = 0.0;
  }
}
728 
729 //------------------------------------------------------------------------------
730 // 2D quadrature weights
731 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 2D quadrature weight: tensor product of the 1D weights for this thread's
  // (tidx, tidy) point; threads outside the Q1d x Q1d tile receive zero.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    *w = qweight1d[data.tidx]*qweight1d[data.tidy];
  } else {
    *w = 0.0;
  }
}
737 
738 //------------------------------------------------------------------------------
739 // 3D quadrature weights
740 //------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  // 3D quadrature weights: each thread holds all Q1d z-levels for its
  // (tidx, tidy) point, so fill w[z] with the tensor-product weight; threads
  // outside the Q1d x Q1d tile receive zeros.
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedScalar w_xy = qweight1d[data.tidx]*qweight1d[data.tidy];
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = w_xy*qweight1d[z];
  } else {
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = 0.0;
  }
}
748 
749 );
750 //------------------------------------------------------------------------------
// Build single operator kernel
752 //------------------------------------------------------------------------------
753 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
754 
755   using std::ostringstream;
756   using std::string;
757   int ierr;
758   bool setupdone;
759   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760   if (setupdone) return CEED_ERROR_SUCCESS;
761   Ceed ceed;
762   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763   CeedOperator_Hip_gen *data;
764   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765   CeedQFunction qf;
766   CeedQFunction_Hip_gen *qf_data;
767   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769   CeedSize lsize;
770   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
771           numoutputfields, ncomp, dim = 0;
772   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
773   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
774   CeedOperatorField *opinputfields, *opoutputfields;
775   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
776   CeedChkBackend(ierr);
777   CeedQFunctionField *qfinputfields, *qfoutputfields;
778   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
779   CeedChkBackend(ierr);
780   CeedEvalMode emode;
781   CeedBasis basis;
782   CeedBasis_Hip_shared *basis_data;
783   CeedElemRestriction Erestrict;
784   CeedElemRestriction_Hip *restr_data;
785 
786   // Check for restriction only identity operator
787   bool is_identity_qf;
788   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
789   if (is_identity_qf) {
790     CeedEvalMode emodein, emodeout;
791     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
792     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
793     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
794       // LCOV_EXCL_START
795       return CeedError(ceed, CEED_ERROR_BACKEND,
796                        "Backend does not implement restriction only identity operators");
797     // LCOV_EXCL_STOP
798   }
799 
800   ostringstream code;
801   string devFunctions(deviceFunctions);
802   code << devFunctions;
803 
804   string qFunction(qf_data->qFunctionSource);
805   string qFunctionName(qf_data->qFunctionName);
806   string oper;
807   oper = "CeedKernel_Hip_gen_" + qFunctionName;
808 
809   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
810   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
811   code << "#define CeedPragmaSIMD\n";
812   code << "#define CEED_ERROR_SUCCESS 0\n\n";
813 
814   // Find dim and Q1d
815   bool useCollograd = true;
816   data->maxP1d = 0;
817   for (CeedInt i = 0; i < numinputfields; i++) {
818     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
819     if (basis != CEED_BASIS_COLLOCATED) {
820       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
821       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
822       CeedChkBackend(ierr);
823 
824       // Check for collocated gradient
825       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
826 
827       // Collect dim and Q1d
828       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
829       bool isTensor;
830       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
831       if (isTensor) {
832         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
833         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
834         if (P1d>data->maxP1d) data->maxP1d = P1d;
835       } else {
836         // LCOV_EXCL_START
837         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
838         // LCOV_EXCL_STOP
839         }
840     }
841   }
842   // Check output bases for Q1d, dim as well
843   //   The only imput basis might be CEED_BASIS_COLLOCATED
844   for (CeedInt i = 0; i < numoutputfields; i++) {
845     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
846 
847     if (basis != CEED_BASIS_COLLOCATED) {
848       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
849       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
850       CeedChkBackend(ierr);
851 
852       // Collect dim and Q1d
853       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
854       bool isTensor;
855       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
856       if (isTensor) {
857         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
858       } else {
859         // LCOV_EXCL_START
860         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
861         // LCOV_EXCL_STOP
862         }
863 
864       // Check for collocated gradient
865       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
866     }
867   }
868   data->dim = dim;
869   data->Q1d = Q1d;
870 
871   // Define CEED_Q_VLA
872   if (dim != 3 || useCollograd) {
873     code << "\n#define CEED_Q_VLA 1\n\n";
874   } else {
875     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
876   }
877 
878   code << qFunction;
879 
880   // Setup
881   code << "\n// -----------------------------------------------------------------------------\n";
882   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
883   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
884   for (CeedInt i = 0; i < numinputfields; i++) {
885     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
886     CeedChkBackend(ierr);
887     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
888       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
889     }
890   }
891 
892   for (CeedInt i = 0; i < numoutputfields; i++) {
893     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
894   }
895 
896   code << "  const CeedInt Dim = "<<dim<<";\n";
897   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
898 
899   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
900   code << "  BackendData data;\n";
901   code << "  data.tidx = threadIdx.x;\n";
902   code << "  data.tidy = threadIdx.y;\n";
903   code << "  data.tidz = threadIdx.z;\n";
904   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
905   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
906 
907   code << "\n  // -- Input field constants and basis data --\n";
908   //Initialize constants, and matrices B and G
909   for (CeedInt i = 0; i < numinputfields; i++) {
910     code << "  // ---- Input field "<<i<<" ----\n";
911     // Get elemsize, emode, ncomp
912     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
913     CeedChkBackend(ierr);
914     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
915     CeedChkBackend(ierr);
916     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
917     CeedChkBackend(ierr);
918     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
919     CeedChkBackend(ierr);
920 
921     // Set field constants
922     if (emode != CEED_EVAL_WEIGHT) {
923       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
924       if (basis != CEED_BASIS_COLLOCATED) {
925         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
926         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
927       } else {
928         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
929       }
930       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
931     }
932 
933     // Load basis data
934     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
935     switch (emode) {
936     case CEED_EVAL_NONE:
937       break;
938     case CEED_EVAL_INTERP:
939       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
940       data->B.in[i] = basis_data->d_interp_1d;
941       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
942       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
943       break;
944     case CEED_EVAL_GRAD:
945       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
946       data->B.in[i] = basis_data->d_interp_1d;
947       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
948       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
949       if (useCollograd) {
950         data->G.in[i] = basis_data->d_collo_grad_1d;
951         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
952         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
953       } else {
954         data->G.in[i] = basis_data->d_grad_1d;
955         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
956         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
957       }
958       break;
959     case CEED_EVAL_WEIGHT:
960       break; // No action
961     case CEED_EVAL_DIV:
962       break; // TODO: Not implemented
963     case CEED_EVAL_CURL:
964       break; // TODO: Not implemented
965     }
966   }
967 
968   code << "\n  // -- Output field constants and basis data --\n";
969   for (CeedInt i = 0; i < numoutputfields; i++) {
970     code << "  // ---- Output field "<<i<<" ----\n";
971     // Get elemsize, emode, ncomp
972     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
973     CeedChkBackend(ierr);
974     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
975     CeedChkBackend(ierr);
976     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
977     CeedChkBackend(ierr);
978     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
979     CeedChkBackend(ierr);
980 
981     // Set field constants
982     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
983     if (basis != CEED_BASIS_COLLOCATED) {
984       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
985       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
986     } else {
987       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
988     }
989     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
990 
991     // Load basis data
992     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
993     switch (emode) {
994     case CEED_EVAL_NONE:
995       break; // No action
996     case CEED_EVAL_INTERP:
997       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
998       data->B.out[i] = basis_data->d_interp_1d;
999       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1000       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1001       break;
1002     case CEED_EVAL_GRAD:
1003       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1004       data->B.out[i] = basis_data->d_interp_1d;
1005       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1006       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1007       if (useCollograd) {
1008         data->G.out[i] = basis_data->d_collo_grad_1d;
1009         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1010         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1011       } else {
1012         data->G.out[i] = basis_data->d_grad_1d;
1013         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1014         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1015       }
1016       break;
1017     // LCOV_EXCL_START
1018     case CEED_EVAL_WEIGHT: {
1019       Ceed ceed;
1020       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1021       return CeedError(ceed, CEED_ERROR_BACKEND,
1022                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1023       break; // Should not occur
1024     }
1025     case CEED_EVAL_DIV:
1026       break; // TODO: Not implemented
1027     case CEED_EVAL_CURL:
1028       break; // TODO: Not implemented
1029       // LCOV_EXCL_STOP
1030     }
1031   }
1032   code << "\n  // -- Element loop --\n";
1033   code << "  __syncthreads();\n";
1034   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1035   // Input basis apply if needed
1036   // Generate the correct eval mode code for each input
1037   code << "    // -- Input field restrictions and basis actions --\n";
1038   for (CeedInt i = 0; i < numinputfields; i++) {
1039     code << "    // ---- Input field "<<i<<" ----\n";
1040     // Get elemsize, emode, ncomp
1041     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1042     CeedChkBackend(ierr);
1043     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1044     CeedChkBackend(ierr);
1045     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1046     CeedChkBackend(ierr);
1047     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1048     CeedChkBackend(ierr);
1049 
1050     // Restriction
1051     if (emode != CEED_EVAL_WEIGHT &&
1052         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1053       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1054 
1055       bool isStrided;
1056       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1057       if (!isStrided) {
1058         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1059         CeedChkBackend(ierr);
1060         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1061         CeedInt compstride;
1062         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1063         code << "    // CompStride: "<<compstride<<"\n";
1064         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1065         data->indices.in[i] = restr_data->d_ind;
1066         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1067       } else {
1068         bool backendstrides;
1069         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1070         CeedChkBackend(ierr);
1071         CeedInt nelem;
1072         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1073         CeedChkBackend(ierr);
1074         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1075         if (!backendstrides) {
1076           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1077           CeedChkBackend(ierr);
1078         }
1079         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1080         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1081       }
1082     }
1083 
1084     // Basis action
1085     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1086     switch (emode) {
1087     case CEED_EVAL_NONE:
1088       if (!useCollograd) {
1089         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1090       }
1091       break;
1092     case CEED_EVAL_INTERP:
1093       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1094       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1095       break;
1096     case CEED_EVAL_GRAD:
1097       if (useCollograd) {
1098         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1099         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1100       } else {
1101         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1102         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1103       }
1104       break;
1105     case CEED_EVAL_WEIGHT:
1106       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1107       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1108       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1109       data->W = basis_data->d_q_weight_1d;
1110       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1111       break; // No action
1112     case CEED_EVAL_DIV:
1113       break; // TODO: Not implemented
1114     case CEED_EVAL_CURL:
1115       break; // TODO: Not implemented
1116     }
1117   }
1118 
1119   // Q function
1120   code << "\n    // -- Output field setup --\n";
1121   for (CeedInt i = 0; i < numoutputfields; i++) {
1122       code << "\n    // ---- Output field "<<i<<" ----\n";
1123     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1124     CeedChkBackend(ierr);
1125     if (emode==CEED_EVAL_GRAD)
1126     {
1127       if (useCollograd) {
1128         //Accumulator for gradient slices
1129         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1130         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1131         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1132         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1133         code << "      }\n";
1134         code << "    }\n";
1135       } else {
1136         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1137       }
1138     }
1139     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1140     {
1141       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1142     }
1143   }
1144   // We treat quadrature points per slice in 3d to save registers
1145   if (useCollograd) {
1146     code << "\n    // Note: Collocated Gradient\n";
1147     code << "#pragma unroll\n";
1148     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1149     code << "      // -- Input fields --\n";
1150     for (CeedInt i = 0; i < numinputfields; i++) {
1151       code << "      // ---- Input field "<<i<<" ----\n";
1152       // Get elemsize, emode, ncomp
1153       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1154       CeedChkBackend(ierr);
1155       // Basis action
1156       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1157       switch (emode) {
1158       case CEED_EVAL_NONE:
1159         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1160 
1161         bool isStrided;
1162         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1163         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1164         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1165         if (!isStrided) {
1166           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1167           CeedChkBackend(ierr);
1168           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1169           CeedInt compstride;
1170           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1171           code << "      // CompStride: "<<compstride<<"\n";
1172           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1173           data->indices.in[i] = restr_data->d_ind;
1174           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1175         } else {
1176           bool backendstrides;
1177           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1178           CeedChkBackend(ierr);
1179           CeedInt nelem;
1180           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1181           CeedChkBackend(ierr);
1182           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1183           if (!backendstrides) {
1184             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1185             CeedChkBackend(ierr);
1186           }
1187           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1188           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1189         }
1190         break;
1191       case CEED_EVAL_INTERP:
1192         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1193         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1194         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1195         code << "      }\n";
1196         break;
1197       case CEED_EVAL_GRAD:
1198         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1199         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1200         break;
1201       case CEED_EVAL_WEIGHT:
1202         code << "      CeedScalar r_q"<<i<<"[1];\n";
1203         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1204         break; // No action
1205       case CEED_EVAL_DIV:
1206         break; // TODO: Not implemented
1207       case CEED_EVAL_CURL:
1208         break; // TODO: Not implemented
1209       }
1210     }
1211     code << "\n      // -- Output fields --\n";
1212     for (CeedInt i = 0; i < numoutputfields; i++) {
1213       code << "      // ---- Output field "<<i<<" ----\n";
1214       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1215       CeedChkBackend(ierr);
1216       // Basis action
1217       switch (emode) {
1218       case CEED_EVAL_NONE:
1219         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1220         break; // No action
1221       case CEED_EVAL_INTERP:
1222         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1223         break;
1224       case CEED_EVAL_GRAD:
1225         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1226         break;
1227       case CEED_EVAL_WEIGHT:
1228         break; // Should not occur
1229       case CEED_EVAL_DIV:
1230         break; // TODO: Not implemented
1231       case CEED_EVAL_CURL:
1232         break; // TODO: Not implemented
1233       }
1234     }
1235   } else {
1236     code << "\n      // Note: No Collocated Gradient\n";
1237     code << "      // -- Input fields --\n";
1238     for (CeedInt i = 0; i < numinputfields; i++) {
1239       code << "      // ---- Input field "<<i<<" ----\n";
1240       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1241     }
1242     code << "      // -- Output fields --\n";
1243     for (CeedInt i = 0; i < numoutputfields; i++) {
1244       code << "      // ---- Output field "<<i<<" ----\n";
1245       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1246     }
1247   }
1248   code << "\n      // -- QFunction Inputs and outputs --\n";
1249   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1250   for (CeedInt i = 0; i < numinputfields; i++) {
1251     code << "      // ---- Input field "<<i<<" ----\n";
1252     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1253   }
1254   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1255   for (CeedInt i = 0; i < numoutputfields; i++) {
1256     code << "      // ---- Output field "<<i<<" ----\n";
1257     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1258   }
1259   code << "\n      // -- Apply QFunction --\n";
1260   code << "      "<<qFunctionName<<"(ctx, ";
1261   if (dim != 3 || useCollograd) {
1262     code << "1";
1263   } else {
1264     code << "Q1d";
1265   }
1266   code << ", in, out);\n";
1267   if (useCollograd) {
1268     code << "\n      // Note: Collocated Gradient\n";
1269     code << "      // -- Output fields --\n";
1270     for (CeedInt i = 0; i < numoutputfields; i++) {
1271       code << "      // ---- Output field "<<i<<" ----\n";
1272       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1273       CeedChkBackend(ierr);
1274       // Basis action
1275       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1276       switch (emode) {
1277       case CEED_EVAL_NONE:
1278         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1279         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1280         code << "      }\n";
1281         break; // No action
1282       case CEED_EVAL_INTERP:
1283         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1284         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1285         code << "      }\n";
1286         break;
1287       case CEED_EVAL_GRAD:
1288         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1289         break;
1290       case CEED_EVAL_WEIGHT:
1291         break; // Should not occur
1292       case CEED_EVAL_DIV:
1293         break; // TODO: Not implemented
1294       case CEED_EVAL_CURL:
1295         break; // TODO: Not implemented
1296       }
1297     }
1298     code << "    }\n";
1299   }
1300 
  // Output basis apply if needed
  // Generate the correct eval mode code for each output:
  // transpose basis action (quadrature -> nodes) followed by the restriction
  // write (E-vector registers r_v<i> -> L-vector d_v<i>)
  code << "\n    // -- Output field basis action and restrictions --\n";
  for (CeedInt i = 0; i < numoutputfields; i++) {
    code << "    // ---- Output field "<<i<<" ----\n";
    // Get elemsize, emode, ncomp
    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
    CeedChkBackend(ierr);
    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
    CeedChkBackend(ierr);
    ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
    CeedChkBackend(ierr);
    // Basis action
    code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    switch (emode) {
    case CEED_EVAL_NONE:
      // No basis transpose needed: alias r_v<i> directly to the
      // quadrature-space registers
      code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
      break; // No action
    case CEED_EVAL_INTERP:
      code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      break;
    case CEED_EVAL_GRAD:
      code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
      if (useCollograd) {
        // Gradient transpose was already applied pointwise at quadrature
        // points (gradColloTranspose3d), so only interp transpose remains
        code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
      } else {
        code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
      }
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_WEIGHT: {
      // WEIGHT has no transpose; reaching this case is a caller error
      Ceed ceed;
      ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
      break; // Should not occur
    }
    case CEED_EVAL_DIV:
      break; // TODO: Not implemented
    case CEED_EVAL_CURL:
      break; // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
    // Restriction
      bool isStrided;
      ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
    if (!isStrided) {
      // Offset (indexed) restriction: bake lsize and compstride into the
      // generated source and emit an indexed scatter-add write
      ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
      CeedChkBackend(ierr);
      code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
      CeedInt compstride;
      ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
      code << "    // CompStride: "<<compstride<<"\n";
      // Stash the restriction's index array (d_ind — presumably a device
      // pointer; confirm against the hip-ref restriction backend) so the
      // kernel launch can pass it in as indices.out[i]
      ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
      data->indices.out[i] = restr_data->d_ind;
      code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
    } else {
      // Strided restriction: use the backend's default strides
      // {1, elemsize*nelem, elemsize} unless user strides are provided,
      // then bake them into the generated source as template arguments
      bool backendstrides;
      ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
      CeedChkBackend(ierr);
      CeedInt nelem;
      ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
      CeedChkBackend(ierr);
      CeedInt strides[3] = {1, elemsize*nelem, elemsize};
      if (!backendstrides) {
        ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
        CeedChkBackend(ierr);
      }
      code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
      code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
    }
  }
1376 
  // Close the generated element loop and the kernel body
  code << "  }\n";
  code << "}\n";
  code << "// -----------------------------------------------------------------------------\n\n";

  // View kernel for debugging
  CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
  CeedDebug(ceed, code.str().c_str());

  // Choose the launch block dimensions for this operator's dim/P1d/Q1d
  // (BlockGridCalculate_Hip_gen, defined at the top of this file)
  CeedInt block_sizes[3] = {0, 0, 0};
  ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
  CeedChkBackend(ierr);
  // JIT-compile the assembled source with two macro definitions:
  // T1d (1D thread-block extent) and BLOCK_SIZE (total threads per block)
  ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
                         "T1d", block_sizes[0],
                         "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
  CeedChkBackend(ierr);
  // Look up the compiled operator kernel by its generated name
  ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
  CeedChkBackend(ierr);

  // Mark the operator as set up so this build is not repeated
  ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
  return CEED_ERROR_SUCCESS;
}
1398 //------------------------------------------------------------------------------
1399