xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 990fdeb6bb8fc9af2da4472bdc0d1f57da5da0e5)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/ceed.h>
11 #include <ceed/backend.h>
12 #include <iostream>
13 #include <string>
14 #include <sstream>
15 #include "ceed-hip-gen.h"
16 #include "../hip-ref/ceed-hip-ref.h"
17 #include "../hip-shared/ceed-hip-shared.h"
18 #include "../hip/ceed-hip-compile.h"
19 
20 
21 //------------------------------------------------------------------------------
22 // Calculate the block size used for launching the operator kernel
23 //------------------------------------------------------------------------------
24 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
25      	                                  const CeedInt P1d, const CeedInt Q1d,
26 				          CeedInt *block_sizes) {
27 
28   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
29   if (dim==1) {
30     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
31     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
32     block_sizes[0] = thread1d;
33     block_sizes[1] = 1;
34     block_sizes[2] = elemsPerBlock;
35   } else if (dim==2) {
36     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
37     block_sizes[0] = thread1d;
38     block_sizes[1] = thread1d;
39     block_sizes[2] = elemsPerBlock;
40   } else if (dim==3) {
41     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
42     block_sizes[0] = thread1d;
43     block_sizes[1] = thread1d;
44     block_sizes[2] = elemsPerBlock;
45   }
46   return CEED_ERROR_SUCCESS;
47 }
48 
49 static const char *deviceFunctions = QUOTE(
50 
//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Argument bundles passed to the generated kernel: fixed tables of up to 16
// input and 16 output field pointers (scalar data and integer offset arrays).
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
56 
// Per-thread state for the basis helpers below: the thread's coordinates
// within the block plus its element's patch of shared-memory scratch.
typedef struct {
  CeedInt tidx; // thread x-index (node/quad point along the first dimension)
  CeedInt tidy; // thread y-index (second dimension)
  CeedInt tidz; // thread z-index -- presumably the element within the block; TODO confirm
  CeedInt tid;  // flattened thread index within the block (see loadMatrix)
  CeedScalar* slice; // this element's slice of shared memory, indexed with stride T1d
} BackendData;
64 
//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy a P*Q basis matrix from global memory d_B into B
// (e.g. shared memory), each thread striding by the full block size.
// No barrier here -- callers must synchronize before reading B.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}
73 
74 //------------------------------------------------------------------------------
75 // 1D
76 //------------------------------------------------------------------------------
77 
78 //------------------------------------------------------------------------------
79 // L-vector -> E-vector, offsets provided
80 //------------------------------------------------------------------------------
81 template <int NCOMP, int COMPSTRIDE, int P1d>
82 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
83   if (data.tidx < P1d) {
84     const CeedInt node = data.tidx;
85     const CeedInt ind = indices[node + elem * P1d];
86     for (CeedInt comp = 0; comp < NCOMP; ++comp)
87       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // L-vector -> E-vector, strided
93 //------------------------------------------------------------------------------
94 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
95 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
96   if (data.tidx < P1d) {
97     const CeedInt node = data.tidx;
98     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
99     for (CeedInt comp = 0; comp < NCOMP; ++comp)
100       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
101   }
102 }
103 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add this element's nodal values back into the L-vector through the
// indirection table. atomicAdd is required because neighboring elements may
// map to the same L-vector entry. nnodes is unused here.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
116 
117 //------------------------------------------------------------------------------
118 // E-vector -> L-vector, strided
119 //------------------------------------------------------------------------------
120 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
121 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
122   if (data.tidx < P1d) {
123     const CeedInt node = data.tidx;
124     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
125     for (CeedInt comp = 0; comp < NCOMP; ++comp)
126       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
127   }
128 }
129 
//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V = B * U along x: stage each thread's value in shared memory, then each of
// the first Q1d threads dots row tidx of B with the staged slice.
// B is row-major Q1d x P1d. Both barriers are required: the first before any
// thread reads the slice, the second before the caller's next reuse of it.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect slice against the caller's next staging
}
143 
//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// V = B^T * U along x: same staging pattern as ContractX1d but reads column
// tidx of the row-major Q1d x P1d matrix B (P1d output threads active).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect slice against the caller's next staging
}
157 
158 //------------------------------------------------------------------------------
159 // 1D interpolate to quadrature points
160 //------------------------------------------------------------------------------
161 template <int NCOMP, int P1d, int Q1d>
162 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163   for (CeedInt comp = 0; comp < NCOMP; comp++)
164     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 1D interpolate transpose
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp=0; comp<NCOMP; comp++)
173     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D derivatives at quadrature points
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp = 0; comp < NCOMP; comp++)
182     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 1D derivatives transpose
187 //------------------------------------------------------------------------------
188 template <int NCOMP, int P1d, int Q1d>
189 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
190   for (CeedInt comp = 0; comp < NCOMP; comp++)
191     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D
196 //------------------------------------------------------------------------------
197 
198 //------------------------------------------------------------------------------
199 // L-vector -> E-vector, offsets provided
200 //------------------------------------------------------------------------------
201 template <int NCOMP, int COMPSTRIDE, int P1d>
202 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
203   if (data.tidx < P1d && data.tidy < P1d) {
204     const CeedInt node = data.tidx + data.tidy*P1d;
205     const CeedInt ind = indices[node + elem * P1d*P1d];
206     for (CeedInt comp = 0; comp < NCOMP; ++comp)
207       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
208   }
209 }
210 
211 //------------------------------------------------------------------------------
212 // L-vector -> E-vector, strided
213 //------------------------------------------------------------------------------
214 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
215 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
216   if (data.tidx < P1d && data.tidy < P1d) {
217     const CeedInt node = data.tidx + data.tidy*P1d;
218     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
219     for (CeedInt comp = 0; comp < NCOMP; ++comp)
220       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
221   }
222 }
223 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add one node (tidx, tidy) back into the L-vector; atomicAdd because
// elements sharing a node write the same entry. nnodes is unused here.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
236 
237 //------------------------------------------------------------------------------
238 // E-vector -> L-vector, strided
239 //------------------------------------------------------------------------------
240 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
241 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
242   if (data.tidx < P1d && data.tidy < P1d) {
243     const CeedInt node = data.tidx + data.tidy*P1d;
244     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
245     for (CeedInt comp = 0; comp < NCOMP; ++comp)
246       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
247   }
248 }
249 
//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// V = contraction of U with B (row-major Q1d x P1d) along x. The element's
// values are staged in shared memory with row stride T1d (the JIT-time padded
// thread extent). Active output threads: tidx < Q1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice against the caller's next staging
}
263 
//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// V = contraction of U with B along y; reads a column of the staged slice.
// Active output threads: tidx < Q1d, tidy < Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // protect slice against the caller's next staging
}
277 
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// V = contraction of U with B^T along y (quad -> node in y).
// Active output threads: tidx < Q1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // protect slice against the caller's next staging
}
291 
//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// V = contraction of U with B^T along x (quad -> node in x); overwrites V.
// Active output threads: tidx < P1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // slice fully written before reads
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice against the caller's next staging
}
305 
//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but ACCUMULATES into V (no zeroing) -- used to
// sum the x and y gradient contributions in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads(); // slice fully written before reads
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice against the caller's next staging
}
318 
319 //------------------------------------------------------------------------------
320 // 2D interpolate to quadrature points
321 //------------------------------------------------------------------------------
322 template <int NCOMP, int P1d, int Q1d>
323 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
324   CeedScalar r_t[1];
325   for (CeedInt comp = 0; comp < NCOMP; comp++) {
326     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
327     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
328   }
329 }
330 
331 //------------------------------------------------------------------------------
332 // 2D interpolate transpose
333 //------------------------------------------------------------------------------
334 template <int NCOMP, int P1d, int Q1d>
335 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
336   CeedScalar r_t[1];
337   for (CeedInt comp = 0; comp < NCOMP; comp++) {
338     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
339     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
340   }
341 }
342 
343 //------------------------------------------------------------------------------
344 // 2D derivatives at quadrature points
345 //------------------------------------------------------------------------------
346 template <int NCOMP, int P1d, int Q1d>
347 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
348   CeedScalar r_t[1];
349   for (CeedInt comp = 0; comp < NCOMP; comp++) {
350     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
351     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
352     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
353     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
354   }
355 }
356 
357 //------------------------------------------------------------------------------
358 // 2D derivatives transpose
359 //------------------------------------------------------------------------------
360 template <int NCOMP, int P1d, int Q1d>
361 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
362   CeedScalar r_t[1];
363   for (CeedInt comp = 0; comp < NCOMP; comp++) {
364     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
365     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
366     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
367     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D
373 //------------------------------------------------------------------------------
374 
375 //------------------------------------------------------------------------------
376 // L-vector -> E-vector, offsets provided
377 //------------------------------------------------------------------------------
378 template <int NCOMP, int COMPSTRIDE, int P1d>
379 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
380   if (data.tidx < P1d && data.tidy < P1d)
381     for (CeedInt z = 0; z < P1d; ++z) {
382       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
383       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
384       for (CeedInt comp = 0; comp < NCOMP; ++comp)
385         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
386     }
387 }
388 
389 //------------------------------------------------------------------------------
390 // L-vector -> E-vector, strided
391 //------------------------------------------------------------------------------
392 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
393 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
394   if (data.tidx < P1d && data.tidy < P1d)
395     for (CeedInt z = 0; z < P1d; ++z) {
396       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
397       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
398       for (CeedInt comp = 0; comp < NCOMP; ++comp)
399         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
400     }
401 }
402 
403 //------------------------------------------------------------------------------
404 // E-vector -> Q-vector, offests provided
405 //------------------------------------------------------------------------------
406 template <int NCOMP, int COMPSTRIDE, int Q1d>
407 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
408   if (data.tidx < Q1d && data.tidy < Q1d) {
409     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
411     for (CeedInt comp = 0; comp < NCOMP; ++comp)
412       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> Q-vector, strided
418 //------------------------------------------------------------------------------
419 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
420 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
421   if (data.tidx < Q1d && data.tidy < Q1d) {
422     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
423     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
424     for (CeedInt comp = 0; comp < NCOMP; ++comp)
425       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
426   }
427 }
428 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add a full z-column of nodes back into the L-vector through the
// indirection table; atomicAdd because elements share nodes. nnodes unused.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}
442 
443 //------------------------------------------------------------------------------
444 // E-vector -> L-vector, strided
445 //------------------------------------------------------------------------------
446 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
447 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
448   if (data.tidx < P1d && data.tidy < P1d)
449     for (CeedInt z = 0; z < P1d; ++z) {
450       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
451       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
452       for (CeedInt comp = 0; comp < NCOMP; ++comp)
453         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
454     }
455 }
456 
//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract U with B along x for each z-level k. This thread's row of B is
// first cached in registers; each z-level is then staged through shared
// memory with the usual double barrier. Output threads: tidx < Q1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d]; // cache this thread's B row in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
476 
//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract U with B along y for each z-level k; B row cached per tidy.
// Output threads: tidx < Q1d, tidy < Q1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d]; // cache this thread's B row in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
496 
//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract U with B along z. Purely register-based: the z-column already lives
// in this thread's registers, so no shared memory or barriers are needed.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
509 
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Contract U with B^T along z; register-only like ContractZ3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
522 
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract U with B^T along y per z-level; overwrites V. B column cached in
// registers per tidy. Output threads: tidx < Q1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B column in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
542 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but ACCUMULATES into V (no zeroing).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B column in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
561 
//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract U with B^T along x per z-level; overwrites V. B column cached in
// registers per tidx. Output threads: tidx < P1d, tidy < P1d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B column in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
581 
//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but ACCUMULATES into V (no zeroing) -- used to
// sum gradient contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B column in registers

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads(); // slice fully written before reads
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next z-level
  }
}
600 
601 //------------------------------------------------------------------------------
602 // 3D interpolate to quadrature points
603 //------------------------------------------------------------------------------
604 template <int NCOMP, int P1d, int Q1d>
605 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
606   CeedScalar r_t1[T1d];
607   CeedScalar r_t2[T1d];
608   for (CeedInt comp = 0; comp < NCOMP; comp++) {
609     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
610     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
611     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
612   }
613 }
614 
615 //------------------------------------------------------------------------------
616 // 3D interpolate transpose
617 //------------------------------------------------------------------------------
618 template <int NCOMP, int P1d, int Q1d>
619 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
620   CeedScalar r_t1[T1d];
621   CeedScalar r_t2[T1d];
622   for (CeedInt comp = 0; comp < NCOMP; comp++) {
623     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
624     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
625     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
626   }
627 }
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives at quadrature points
631 //------------------------------------------------------------------------------
632 template <int NCOMP, int P1d, int Q1d>
633 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
634   CeedScalar r_t1[T1d];
635   CeedScalar r_t2[T1d];
636   for (CeedInt comp = 0; comp < NCOMP; comp++) {
637     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
638     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
639     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
640     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
641     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
642     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
643     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
644     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
645     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
646   }
647 }
648 
649 //------------------------------------------------------------------------------
650 // 3D derivatives transpose
651 //------------------------------------------------------------------------------
652 template <int NCOMP, int P1d, int Q1d>
653 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
654   CeedScalar r_t1[T1d];
655   CeedScalar r_t2[T1d];
656   for (CeedInt comp = 0; comp < NCOMP; comp++) {
657     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
658     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
659     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
660     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
661     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
662     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
663     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
664     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
665     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
666   }
667 }
668 
669 //------------------------------------------------------------------------------
670 // 3D collocated derivatives computation
671 //------------------------------------------------------------------------------
672 template <int NCOMP, int Q1d>
673 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
674   if (data.tidx < Q1d && data.tidy < Q1d) {
675     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
676       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
677       __syncthreads();
678       // X derivative
679       r_V[comp+0*NCOMP] = 0.0;
680       for (CeedInt i = 0; i < Q1d; ++i)
681         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
682       // Y derivative
683       r_V[comp+1*NCOMP] = 0.0;
684       for (CeedInt i = 0; i < Q1d; ++i)
685         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
686       // Z derivative
687       r_V[comp+2*NCOMP] = 0.0;
688       for (CeedInt i = 0; i < Q1d; ++i)
689         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
690       __syncthreads();
691     }
692   }
693 }
694 
695 //------------------------------------------------------------------------------
696 // 3D collocated derivatives transpose
697 //------------------------------------------------------------------------------
698 template <int NCOMP, int Q1d>
699 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
700   if (data.tidx < Q1d && data.tidy < Q1d) {
701     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
702       // X derivative
703       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
704       __syncthreads();
705       for (CeedInt i = 0; i < Q1d; ++i)
706         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
707       __syncthreads();
708       // Y derivative
709       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
710       __syncthreads();
711       for (CeedInt i = 0; i < Q1d; ++i)
712         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
713       __syncthreads();
714       // Z derivative
715       for (CeedInt i = 0; i < Q1d; ++i)
716         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
717     }
718   }
719 }
720 
721 //------------------------------------------------------------------------------
722 // 1D quadrature weights
723 //------------------------------------------------------------------------------
724 template <int Q1d>
725 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
726   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
727 }
728 
729 //------------------------------------------------------------------------------
730 // 2D quadrature weights
731 //------------------------------------------------------------------------------
732 template <int Q1d>
733 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
734   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
735         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736 }
737 
738 //------------------------------------------------------------------------------
739 // 3D quadrature weights
740 //------------------------------------------------------------------------------
741 template <int Q1d>
742 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
743   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
744   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
745   for (CeedInt z = 0; z < Q1d; ++z)
746     w[z] = quad ? pw*qweight1d[z] : 0.0;
747 }
748 
749 );
750 //------------------------------------------------------------------------------
// Build single operator kernel
752 //------------------------------------------------------------------------------
753 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
754 
755   using std::ostringstream;
756   using std::string;
757   int ierr;
758   bool setupdone;
759   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760   if (setupdone) return CEED_ERROR_SUCCESS;
761   Ceed ceed;
762   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763   CeedOperator_Hip_gen *data;
764   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765   CeedQFunction qf;
766   CeedQFunction_Hip_gen *qf_data;
767   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769   CeedSize lsize;
770   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
771           numoutputfields, ncomp, dim = 1;
772   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
773   Q1d = Q;
774   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
775   CeedOperatorField *opinputfields, *opoutputfields;
776   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
777   CeedChkBackend(ierr);
778   CeedQFunctionField *qfinputfields, *qfoutputfields;
779   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
780   CeedChkBackend(ierr);
781   CeedEvalMode emode;
782   CeedBasis basis;
783   CeedBasis_Hip_shared *basis_data;
784   CeedElemRestriction Erestrict;
785   CeedElemRestriction_Hip *restr_data;
786 
787   // Check for restriction only identity operator
788   bool is_identity_qf;
789   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
790   if (is_identity_qf) {
791     CeedEvalMode emodein, emodeout;
792     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
793     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
794     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
795       // LCOV_EXCL_START
796       return CeedError(ceed, CEED_ERROR_BACKEND,
797                        "Backend does not implement restriction only identity operators");
798     // LCOV_EXCL_STOP
799   }
800 
801   ostringstream code;
802   string devFunctions(deviceFunctions);
803   code << devFunctions;
804 
805   string qFunction(qf_data->qFunctionSource);
806   string qFunctionName(qf_data->qFunctionName);
807   string oper;
808   oper = "CeedKernel_Hip_gen_" + qFunctionName;
809 
810   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
811   code << "#define CEED_QFUNCTION_HELPER __device__ __forceinline__\n";
812   code << "#define CeedPragmaSIMD\n";
813   code << "#define CEED_ERROR_SUCCESS 0\n\n";
814 
815   // Find dim and Q1d
816   bool useCollograd = true;
817   data->maxP1d = 0;
818   for (CeedInt i = 0; i < numinputfields; i++) {
819     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
820     if (basis != CEED_BASIS_COLLOCATED) {
821       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
822       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
823       CeedChkBackend(ierr);
824 
825       // Check for collocated gradient
826       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
827 
828       // Collect dim and Q1d
829       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
830       bool isTensor;
831       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
832       if (isTensor) {
833         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
834         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
835         if (P1d>data->maxP1d) data->maxP1d = P1d;
836       } else {
837         // LCOV_EXCL_START
838         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
839         // LCOV_EXCL_STOP
840         }
841     }
842   }
843   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
845   for (CeedInt i = 0; i < numoutputfields; i++) {
846     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
847 
848     if (basis != CEED_BASIS_COLLOCATED) {
849       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
850       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
851       CeedChkBackend(ierr);
852 
853       // Collect dim and Q1d
854       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
855       bool isTensor;
856       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
857       if (isTensor) {
858         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
859       } else {
860         // LCOV_EXCL_START
861         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
862         // LCOV_EXCL_STOP
863         }
864 
865       // Check for collocated gradient
866       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
867     }
868   }
869   data->dim = dim;
870   data->Q1d = Q1d;
871 
872   // Define CEED_Q_VLA
873   if (dim != 3 || useCollograd) {
874     code << "\n#define CEED_Q_VLA 1\n\n";
875   } else {
876     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
877   }
878 
879   code << qFunction;
880 
881   // Setup
882   code << "\n// -----------------------------------------------------------------------------\n";
883   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
884   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
885   for (CeedInt i = 0; i < numinputfields; i++) {
886     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
887     CeedChkBackend(ierr);
888     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
889       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
890     }
891   }
892 
893   for (CeedInt i = 0; i < numoutputfields; i++) {
894     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
895   }
896 
897   code << "  const CeedInt Dim = "<<dim<<";\n";
898   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
899 
900   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
901   code << "  BackendData data;\n";
902   code << "  data.tidx = threadIdx.x;\n";
903   code << "  data.tidy = threadIdx.y;\n";
904   code << "  data.tidz = threadIdx.z;\n";
905   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
906   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
907 
908   code << "\n  // -- Input field constants and basis data --\n";
909   //Initialize constants, and matrices B and G
910   for (CeedInt i = 0; i < numinputfields; i++) {
911     code << "  // ---- Input field "<<i<<" ----\n";
912     // Get elemsize, emode, ncomp
913     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
914     CeedChkBackend(ierr);
915     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
916     CeedChkBackend(ierr);
917     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
918     CeedChkBackend(ierr);
919     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
920     CeedChkBackend(ierr);
921 
922     // Set field constants
923     if (emode != CEED_EVAL_WEIGHT) {
924       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
925       if (basis != CEED_BASIS_COLLOCATED) {
926         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
927         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
928       } else {
929         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
930       }
931       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
932     }
933 
934     // Load basis data
935     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
936     switch (emode) {
937     case CEED_EVAL_NONE:
938       break;
939     case CEED_EVAL_INTERP:
940       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
941       data->B.in[i] = basis_data->d_interp_1d;
942       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
943       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
944       break;
945     case CEED_EVAL_GRAD:
946       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
947       data->B.in[i] = basis_data->d_interp_1d;
948       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
949       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
950       if (useCollograd) {
951         data->G.in[i] = basis_data->d_collo_grad_1d;
952         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
953         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
954       } else {
955         data->G.in[i] = basis_data->d_grad_1d;
956         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
957         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
958       }
959       break;
960     case CEED_EVAL_WEIGHT:
961       break; // No action
962     case CEED_EVAL_DIV:
963       break; // TODO: Not implemented
964     case CEED_EVAL_CURL:
965       break; // TODO: Not implemented
966     }
967   }
968 
969   code << "\n  // -- Output field constants and basis data --\n";
970   for (CeedInt i = 0; i < numoutputfields; i++) {
971     code << "  // ---- Output field "<<i<<" ----\n";
972     // Get elemsize, emode, ncomp
973     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
974     CeedChkBackend(ierr);
975     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
976     CeedChkBackend(ierr);
977     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
978     CeedChkBackend(ierr);
979     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
980     CeedChkBackend(ierr);
981 
982     // Set field constants
983     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
984     if (basis != CEED_BASIS_COLLOCATED) {
985       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
986       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
987     } else {
988       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
989     }
990     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
991 
992     // Load basis data
993     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
994     switch (emode) {
995     case CEED_EVAL_NONE:
996       break; // No action
997     case CEED_EVAL_INTERP:
998       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
999       data->B.out[i] = basis_data->d_interp_1d;
1000       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1001       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1002       break;
1003     case CEED_EVAL_GRAD:
1004       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1005       data->B.out[i] = basis_data->d_interp_1d;
1006       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1007       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1008       if (useCollograd) {
1009         data->G.out[i] = basis_data->d_collo_grad_1d;
1010         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1011         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1012       } else {
1013         data->G.out[i] = basis_data->d_grad_1d;
1014         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1015         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1016       }
1017       break;
1018     // LCOV_EXCL_START
1019     case CEED_EVAL_WEIGHT: {
1020       Ceed ceed;
1021       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1022       return CeedError(ceed, CEED_ERROR_BACKEND,
1023                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1024       break; // Should not occur
1025     }
1026     case CEED_EVAL_DIV:
1027       break; // TODO: Not implemented
1028     case CEED_EVAL_CURL:
1029       break; // TODO: Not implemented
1030       // LCOV_EXCL_STOP
1031     }
1032   }
1033   code << "\n  // -- Element loop --\n";
1034   code << "  __syncthreads();\n";
1035   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1036   // Input basis apply if needed
1037   // Generate the correct eval mode code for each input
1038   code << "    // -- Input field restrictions and basis actions --\n";
1039   for (CeedInt i = 0; i < numinputfields; i++) {
1040     code << "    // ---- Input field "<<i<<" ----\n";
1041     // Get elemsize, emode, ncomp
1042     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1043     CeedChkBackend(ierr);
1044     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1045     CeedChkBackend(ierr);
1046     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1047     CeedChkBackend(ierr);
1048     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1049     CeedChkBackend(ierr);
1050 
1051     // Restriction
1052     if (emode != CEED_EVAL_WEIGHT &&
1053         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1054       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1055 
1056       bool isStrided;
1057       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1058       if (!isStrided) {
1059         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1060         CeedChkBackend(ierr);
1061         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1062         CeedInt compstride;
1063         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1064         code << "    // CompStride: "<<compstride<<"\n";
1065         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1066         data->indices.in[i] = restr_data->d_ind;
1067         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1068       } else {
1069         bool backendstrides;
1070         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1071         CeedChkBackend(ierr);
1072         CeedInt nelem;
1073         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1074         CeedChkBackend(ierr);
1075         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1076         if (!backendstrides) {
1077           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1078           CeedChkBackend(ierr);
1079         }
1080         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1081         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1082       }
1083     }
1084 
1085     // Basis action
1086     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1087     switch (emode) {
1088     case CEED_EVAL_NONE:
1089       if (!useCollograd) {
1090         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1091       }
1092       break;
1093     case CEED_EVAL_INTERP:
1094       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1095       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1096       break;
1097     case CEED_EVAL_GRAD:
1098       if (useCollograd) {
1099         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101       } else {
1102         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1103         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1104       }
1105       break;
1106     case CEED_EVAL_WEIGHT:
1107       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1108       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1109       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1110       data->W = basis_data->d_q_weight_1d;
1111       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1112       break; // No action
1113     case CEED_EVAL_DIV:
1114       break; // TODO: Not implemented
1115     case CEED_EVAL_CURL:
1116       break; // TODO: Not implemented
1117     }
1118   }
1119 
1120   // Q function
1121   code << "\n    // -- Output field setup --\n";
1122   for (CeedInt i = 0; i < numoutputfields; i++) {
1123       code << "\n    // ---- Output field "<<i<<" ----\n";
1124     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1125     CeedChkBackend(ierr);
1126     if (emode==CEED_EVAL_GRAD)
1127     {
1128       if (useCollograd) {
1129         //Accumulator for gradient slices
1130         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1132         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1133         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1134         code << "      }\n";
1135         code << "    }\n";
1136       } else {
1137         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1138       }
1139     }
1140     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1141     {
1142       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1143     }
1144   }
1145   // We treat quadrature points per slice in 3d to save registers
1146   if (useCollograd) {
1147     code << "\n    // Note: Collocated Gradient\n";
1148     code << "#pragma unroll\n";
1149     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1150     code << "      // -- Input fields --\n";
1151     for (CeedInt i = 0; i < numinputfields; i++) {
1152       code << "      // ---- Input field "<<i<<" ----\n";
1153       // Get elemsize, emode, ncomp
1154       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1155       CeedChkBackend(ierr);
1156       // Basis action
1157       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1158       switch (emode) {
1159       case CEED_EVAL_NONE:
1160         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1161 
1162         bool isStrided;
1163         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1164         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1165         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1166         if (!isStrided) {
1167           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1168           CeedChkBackend(ierr);
1169           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1170           CeedInt compstride;
1171           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1172           code << "      // CompStride: "<<compstride<<"\n";
1173           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1174           data->indices.in[i] = restr_data->d_ind;
1175           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1176         } else {
1177           bool backendstrides;
1178           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1179           CeedChkBackend(ierr);
1180           CeedInt nelem;
1181           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1182           CeedChkBackend(ierr);
1183           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1184           if (!backendstrides) {
1185             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1186             CeedChkBackend(ierr);
1187           }
1188           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1189           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1190         }
1191         break;
1192       case CEED_EVAL_INTERP:
1193         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1194         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1195         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1196         code << "      }\n";
1197         break;
1198       case CEED_EVAL_GRAD:
1199         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1200         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1201         break;
1202       case CEED_EVAL_WEIGHT:
1203         code << "      CeedScalar r_q"<<i<<"[1];\n";
1204         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1205         break; // No action
1206       case CEED_EVAL_DIV:
1207         break; // TODO: Not implemented
1208       case CEED_EVAL_CURL:
1209         break; // TODO: Not implemented
1210       }
1211     }
1212     code << "\n      // -- Output fields --\n";
1213     for (CeedInt i = 0; i < numoutputfields; i++) {
1214       code << "      // ---- Output field "<<i<<" ----\n";
1215       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1216       CeedChkBackend(ierr);
1217       // Basis action
1218       switch (emode) {
1219       case CEED_EVAL_NONE:
1220         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1221         break; // No action
1222       case CEED_EVAL_INTERP:
1223         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1224         break;
1225       case CEED_EVAL_GRAD:
1226         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1227         break;
1228       case CEED_EVAL_WEIGHT:
1229         break; // Should not occur
1230       case CEED_EVAL_DIV:
1231         break; // TODO: Not implemented
1232       case CEED_EVAL_CURL:
1233         break; // TODO: Not implemented
1234       }
1235     }
1236   } else {
1237     code << "\n      // Note: No Collocated Gradient\n";
1238     code << "      // -- Input fields --\n";
1239     for (CeedInt i = 0; i < numinputfields; i++) {
1240       code << "      // ---- Input field "<<i<<" ----\n";
1241       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1242     }
1243     code << "      // -- Output fields --\n";
1244     for (CeedInt i = 0; i < numoutputfields; i++) {
1245       code << "      // ---- Output field "<<i<<" ----\n";
1246       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1247     }
1248   }
1249   code << "\n      // -- QFunction Inputs and outputs --\n";
1250   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1251   for (CeedInt i = 0; i < numinputfields; i++) {
1252     code << "      // ---- Input field "<<i<<" ----\n";
1253     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1254   }
// NOTE(review): this is the tail of the operator code-generation routine (the
// function signature and earlier stages are above this excerpt). It finishes
// emitting the fused operator kernel into the `code` string stream — QFunction
// call, output basis transpose actions, output restrictions — then JIT-compiles
// the result and caches the kernel handle in `data`.
1255   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1256   for (CeedInt i = 0; i < numoutputfields; i++) {
1257     code << "      // ---- Output field "<<i<<" ----\n";
  // Each output slot aliases the per-field quadrature registers r_qq<i>
1258     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1259   }
  // Emit the generated QFunction call. The point-count argument is "1"
  // except for 3D kernels without collocated gradients, where it is "Q1d"
  // (each thread then processes a column of Q1d quadrature points).
1260   code << "\n      // -- Apply QFunction --\n";
1261   code << "      "<<qFunctionName<<"(ctx, ";
1262   if (dim != 3 || useCollograd) {
1263     code << "1";
1264   } else {
1265     code << "Q1d";
1266   }
1267   code << ", in, out);\n";
  // Collocated-gradient path: transpose each output from the per-point
  // registers (r_qq<i>) back into the per-column tensor registers (r_tt<i>),
  // then close the quadrature-point loop opened earlier (not visible here).
1268   if (useCollograd) {
1269     code << "\n      // Note: Collocated Gradient\n";
1270     code << "      // -- Output fields --\n";
1271     for (CeedInt i = 0; i < numoutputfields; i++) {
1272       code << "      // ---- Output field "<<i<<" ----\n";
1273       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1274       CeedChkBackend(ierr);
1275       // Basis action
1276       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1277       switch (emode) {
       // NONE and INTERP both copy the point values into slot q of r_tt<i>;
       // the interpolation transpose itself happens after the point loop
1278       case CEED_EVAL_NONE:
1279         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1280         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1281         code << "      }\n";
1282         break; // No action
1283       case CEED_EVAL_INTERP:
1284         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1285         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1286         code << "      }\n";
1287         break;
       // GRAD applies the collocated-gradient transpose at this point only
1288       case CEED_EVAL_GRAD:
1289         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1290         break;
1291       case CEED_EVAL_WEIGHT:
1292         break; // Should not occur
1293       case CEED_EVAL_DIV:
1294         break; // TODO: Not implemented
1295       case CEED_EVAL_CURL:
1296         break; // TODO: Not implemented
1297       }
1298     }
    // Close the generated per-point loop for the collocated-gradient path
1299     code << "    }\n";
1300   }
1301 
1302   // Output basis apply if needed
1303   // Generate the correct eval mode code for each output
1304   code << "\n    // -- Output field basis action and restrictions --\n";
1305   for (CeedInt i = 0; i < numoutputfields; i++) {
1306     code << "    // ---- Output field "<<i<<" ----\n";
1307     // Get elemsize, emode, ncomp
1308     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1309     CeedChkBackend(ierr);
1310     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1311     CeedChkBackend(ierr);
1312     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1313     CeedChkBackend(ierr);
1314     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1315     CeedChkBackend(ierr);
1316     // Basis action
1317     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
    // Emit the transpose basis action mapping quadrature values (r_tt<i>)
    // to element dof values (r_v<i>)
1318     switch (emode) {
    // NONE: no basis action, r_v<i> simply aliases the quadrature registers
1319     case CEED_EVAL_NONE:
1320       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1321       break; // No action
1322     case CEED_EVAL_INTERP:
1323       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1324       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1325       break;
    // GRAD: with collocated gradients the gradient transpose was already
    // applied per point above, so only the interpolation transpose remains;
    // otherwise emit the full gradient transpose
1326     case CEED_EVAL_GRAD:
1327       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1328       if (useCollograd) {
1329         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1330       } else {
1331         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1332       }
1333       break;
1334     // LCOV_EXCL_START
    // WEIGHT is only valid as an input; error out at build time
1335     case CEED_EVAL_WEIGHT: {
1336       Ceed ceed;
1337       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1338       return CeedError(ceed, CEED_ERROR_BACKEND,
1339                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1340       break; // Should not occur
1341     }
1342     case CEED_EVAL_DIV:
1343       break; // TODO: Not implemented
1344     case CEED_EVAL_CURL:
1345       break; // TODO: Not implemented
1346       // LCOV_EXCL_STOP
1347     }
1348     // Restriction
    // Two write paths: indexed (offset) restrictions gather through a device
    // index array; strided restrictions bake compile-time strides into the
    // generated template arguments
1349       bool isStrided;
1350       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1351     if (!isStrided) {
1352       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1353       CeedChkBackend(ierr);
1354       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1355       CeedInt compstride;
1356       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1357       code << "    // CompStride: "<<compstride<<"\n";
      // Cache the restriction's device index pointer for the kernel launch
1358       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1359       data->indices.out[i] = restr_data->d_ind;
1360       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1361     } else {
1362       bool backendstrides;
1363       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1364       CeedChkBackend(ierr);
1365       CeedInt nelem;
1366       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1367       CeedChkBackend(ierr);
      // Default backend layout {1, elemsize*nelem, elemsize}; overridden by
      // the restriction's own strides when it does not use backend strides
1368       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1369       if (!backendstrides) {
1370         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1371         CeedChkBackend(ierr);
1372       }
1373       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1374       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1375     }
1376   }
1377 
  // Close the generated element loop and kernel body
1378   code << "  }\n";
1379   code << "}\n";
1380   code << "// -----------------------------------------------------------------------------\n\n";
1381 
1382   // View kernel for debugging
1383   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1384   CeedDebug(ceed, code.str().c_str());
1385 
  // Compute launch geometry, then JIT-compile the assembled source.
  // T1d = block_sizes[0] (= max(P1d, Q1d), see BlockGridCalculate_Hip_gen);
  // BLOCK_SIZE = total threads per block.
  // NOTE(review): the nelem argument of BlockGridCalculate_Hip_gen is unused
  // in its visible implementation — confirm whether it is still needed.
1386   CeedInt block_sizes[3] = {0, 0, 0};
1387   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1388   CeedChkBackend(ierr);
1389   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1390                          "T1d", block_sizes[0],
1391                          "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1392   CeedChkBackend(ierr);
  // Look up the generated kernel by name and cache the handle for apply
1393   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1394   CeedChkBackend(ierr);
1395 
  // Mark the operator setup complete so this build is not repeated
1396   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1397   return CEED_ERROR_SUCCESS;
1398 }
1399 //------------------------------------------------------------------------------
1399 //------------------------------------------------------------------------------
1400