xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 41bdf1c9f3b677ca0cd3704eec13319d41b7356c)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/ceed.h>
11 #include <ceed/backend.h>
12 #include <iostream>
13 #include <string>
14 #include <sstream>
15 #include "ceed-hip-gen.h"
16 #include "../hip-ref/ceed-hip-ref.h"
17 #include "../hip-shared/ceed-hip-shared.h"
18 #include "../hip/ceed-hip-compile.h"
19 
20 
21 //------------------------------------------------------------------------------
22 // Calculate the block size used for launching the operator kernel
23 //------------------------------------------------------------------------------
24 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
25      	                                  const CeedInt P1d, const CeedInt Q1d,
26 				          CeedInt *block_sizes) {
27 
28   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
29   if (dim==1) {
30     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
31     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
32     block_sizes[0] = thread1d;
33     block_sizes[1] = 1;
34     block_sizes[2] = elemsPerBlock;
35   } else if (dim==2) {
36     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
37     block_sizes[0] = thread1d;
38     block_sizes[1] = thread1d;
39     block_sizes[2] = elemsPerBlock;
40   } else if (dim==3) {
41     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
42     block_sizes[0] = thread1d;
43     block_sizes[1] = thread1d;
44     block_sizes[2] = elemsPerBlock;
45   }
46   return CEED_ERROR_SUCCESS;
47 }
48 
49 static const char *deviceFunctions = QUOTE(
50 
51 //------------------------------------------------------------------------------
52 // Typedefs
53 //------------------------------------------------------------------------------
54 typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
55 typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
56 
57 typedef struct {
58   CeedInt tidx;
59   CeedInt tidy;
60   CeedInt tidz;
61   CeedInt tid;
62   CeedScalar* slice;
63 } BackendData;
64 
65 //------------------------------------------------------------------------------
66 // Load matrices for basis actions
67 //------------------------------------------------------------------------------
68 template <int P, int Q>
69 inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
70   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
71     B[i] = d_B[i];
72 }
73 
74 //------------------------------------------------------------------------------
75 // 1D
76 //------------------------------------------------------------------------------
77 
78 //------------------------------------------------------------------------------
79 // L-vector -> E-vector, offsets provided
80 //------------------------------------------------------------------------------
81 template <int NCOMP, int COMPSTRIDE, int P1d>
82 inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
83   if (data.tidx < P1d) {
84     const CeedInt node = data.tidx;
85     const CeedInt ind = indices[node + elem * P1d];
86     for (CeedInt comp = 0; comp < NCOMP; ++comp)
87       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
88   }
89 }
90 
91 //------------------------------------------------------------------------------
92 // L-vector -> E-vector, strided
93 //------------------------------------------------------------------------------
94 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
95 inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
96   if (data.tidx < P1d) {
97     const CeedInt node = data.tidx;
98     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
99     for (CeedInt comp = 0; comp < NCOMP; ++comp)
100       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
101   }
102 }
103 
104 //------------------------------------------------------------------------------
105 // E-vector -> L-vector, offsets provided
106 //------------------------------------------------------------------------------
107 template <int NCOMP, int COMPSTRIDE, int P1d>
108 inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
109   if (data.tidx < P1d) {
110     const CeedInt node = data.tidx;
111     const CeedInt ind = indices[node + elem * P1d];
112     for (CeedInt comp = 0; comp < NCOMP; ++comp)
113       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
114   }
115 }
116 
117 //------------------------------------------------------------------------------
118 // E-vector -> L-vector, strided
119 //------------------------------------------------------------------------------
120 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
121 inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
122   if (data.tidx < P1d) {
123     const CeedInt node = data.tidx;
124     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
125     for (CeedInt comp = 0; comp < NCOMP; ++comp)
126       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
127   }
128 }
129 
130 //------------------------------------------------------------------------------
131 // 1D tensor contraction x
132 //------------------------------------------------------------------------------
133 template <int NCOMP, int P1d, int Q1d>
134 inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
135   data.slice[data.tidx] = *U;
136   __syncthreads();
137   *V = 0.0;
138   if (data.tidx < Q1d)
139     for (CeedInt i = 0; i < P1d; ++i)
140       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
141   __syncthreads();
142 }
143 
144 //------------------------------------------------------------------------------
145 // 1D transpose tensor contraction x
146 //------------------------------------------------------------------------------
147 template <int NCOMP, int P1d, int Q1d>
148 inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
149   data.slice[data.tidx] = *U;
150   __syncthreads();
151   *V = 0.0;
152   if (data.tidx < P1d)
153     for (CeedInt i = 0; i < Q1d; ++i)
154       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
155   __syncthreads();
156 }
157 
158 //------------------------------------------------------------------------------
159 // 1D interpolate to quadrature points
160 //------------------------------------------------------------------------------
161 template <int NCOMP, int P1d, int Q1d>
162 inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
163   for (CeedInt comp = 0; comp < NCOMP; comp++)
164     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
165 }
166 
167 //------------------------------------------------------------------------------
168 // 1D interpolate transpose
169 //------------------------------------------------------------------------------
170 template <int NCOMP, int P1d, int Q1d>
171 inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
172   for (CeedInt comp=0; comp<NCOMP; comp++)
173     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
174 }
175 
176 //------------------------------------------------------------------------------
177 // 1D derivatives at quadrature points
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int P1d, int Q1d>
180 inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
181   for (CeedInt comp = 0; comp < NCOMP; comp++)
182     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
183 }
184 
185 //------------------------------------------------------------------------------
186 // 1D derivatives transpose
187 //------------------------------------------------------------------------------
188 template <int NCOMP, int P1d, int Q1d>
189 inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
190   for (CeedInt comp = 0; comp < NCOMP; comp++)
191     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
192 }
193 
194 //------------------------------------------------------------------------------
195 // 2D
196 //------------------------------------------------------------------------------
197 
198 //------------------------------------------------------------------------------
199 // L-vector -> E-vector, offsets provided
200 //------------------------------------------------------------------------------
201 template <int NCOMP, int COMPSTRIDE, int P1d>
202 inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
203   if (data.tidx < P1d && data.tidy < P1d) {
204     const CeedInt node = data.tidx + data.tidy*P1d;
205     const CeedInt ind = indices[node + elem * P1d*P1d];
206     for (CeedInt comp = 0; comp < NCOMP; ++comp)
207       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
208   }
209 }
210 
211 //------------------------------------------------------------------------------
212 // L-vector -> E-vector, strided
213 //------------------------------------------------------------------------------
214 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
215 inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
216   if (data.tidx < P1d && data.tidy < P1d) {
217     const CeedInt node = data.tidx + data.tidy*P1d;
218     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
219     for (CeedInt comp = 0; comp < NCOMP; ++comp)
220       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
221   }
222 }
223 
224 //------------------------------------------------------------------------------
225 // E-vector -> L-vector, offsets provided
226 //------------------------------------------------------------------------------
227 template <int NCOMP, int COMPSTRIDE, int P1d>
228 inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
229   if (data.tidx < P1d && data.tidy < P1d) {
230     const CeedInt node = data.tidx + data.tidy*P1d;
231     const CeedInt ind = indices[node + elem * P1d*P1d];
232     for (CeedInt comp = 0; comp < NCOMP; ++comp)
233       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
234   }
235 }
236 
237 //------------------------------------------------------------------------------
238 // E-vector -> L-vector, strided
239 //------------------------------------------------------------------------------
240 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
241 inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
242   if (data.tidx < P1d && data.tidy < P1d) {
243     const CeedInt node = data.tidx + data.tidy*P1d;
244     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
245     for (CeedInt comp = 0; comp < NCOMP; ++comp)
246       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
247   }
248 }
249 
250 //------------------------------------------------------------------------------
251 // 2D tensor contraction x
252 //------------------------------------------------------------------------------
253 template <int NCOMP, int P1d, int Q1d>
254 inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
255   data.slice[data.tidx+data.tidy*T1d] = *U;
256   __syncthreads();
257   *V = 0.0;
258   if (data.tidx < Q1d && data.tidy < P1d)
259     for (CeedInt i = 0; i < P1d; ++i)
260       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
261   __syncthreads();
262 }
263 
264 //------------------------------------------------------------------------------
265 // 2D tensor contract y
266 //------------------------------------------------------------------------------
267 template <int NCOMP, int P1d, int Q1d>
268 inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
269   data.slice[data.tidx+data.tidy*T1d] = *U;
270   __syncthreads();
271   *V = 0.0;
272   if (data.tidx < Q1d && data.tidy < Q1d)
273     for (CeedInt i = 0; i < P1d; ++i)
274       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
275   __syncthreads();
276 }
277 
278 //------------------------------------------------------------------------------
279 // 2D transpose tensor contract y
280 //------------------------------------------------------------------------------
281 template <int NCOMP, int P1d, int Q1d>
282 inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
283   data.slice[data.tidx+data.tidy*T1d] = *U;
284   __syncthreads();
285   *V = 0.0;
286   if (data.tidx < Q1d && data.tidy < P1d)
287     for (CeedInt i = 0; i < Q1d; ++i)
288       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
289   __syncthreads();
290 }
291 
292 //------------------------------------------------------------------------------
293 // 2D transpose tensor contract x
294 //------------------------------------------------------------------------------
295 template <int NCOMP, int P1d, int Q1d>
296 inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
297   data.slice[data.tidx+data.tidy*T1d] = *U;
298   __syncthreads();
299   *V = 0.0;
300   if (data.tidx < P1d && data.tidy < P1d)
301     for (CeedInt i = 0; i < Q1d; ++i)
302       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
303   __syncthreads();
304 }
305 
306 //------------------------------------------------------------------------------
307 // 2D transpose tensor contract and add x
308 //------------------------------------------------------------------------------
309 template <int NCOMP, int P1d, int Q1d>
310 inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
311   data.slice[data.tidx+data.tidy*T1d] = *U;
312   __syncthreads();
313   if (data.tidx < P1d && data.tidy < P1d)
314     for (CeedInt i = 0; i < Q1d; ++i)
315       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
316   __syncthreads();
317 }
318 
319 //------------------------------------------------------------------------------
320 // 2D interpolate to quadrature points
321 //------------------------------------------------------------------------------
322 template <int NCOMP, int P1d, int Q1d>
323 inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
324   CeedScalar r_t[1];
325   for (CeedInt comp = 0; comp < NCOMP; comp++) {
326     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
327     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
328   }
329 }
330 
331 //------------------------------------------------------------------------------
332 // 2D interpolate transpose
333 //------------------------------------------------------------------------------
334 template <int NCOMP, int P1d, int Q1d>
335 inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
336   CeedScalar r_t[1];
337   for (CeedInt comp = 0; comp < NCOMP; comp++) {
338     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
339     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
340   }
341 }
342 
343 //------------------------------------------------------------------------------
344 // 2D derivatives at quadrature points
345 //------------------------------------------------------------------------------
346 template <int NCOMP, int P1d, int Q1d>
347 inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
348   CeedScalar r_t[1];
349   for (CeedInt comp = 0; comp < NCOMP; comp++) {
350     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
351     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
352     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
353     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
354   }
355 }
356 
357 //------------------------------------------------------------------------------
358 // 2D derivatives transpose
359 //------------------------------------------------------------------------------
360 template <int NCOMP, int P1d, int Q1d>
361 inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
362   CeedScalar r_t[1];
363   for (CeedInt comp = 0; comp < NCOMP; comp++) {
364     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
365     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
366     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
367     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
368   }
369 }
370 
371 //------------------------------------------------------------------------------
372 // 3D
373 //------------------------------------------------------------------------------
374 
375 //------------------------------------------------------------------------------
376 // L-vector -> E-vector, offsets provided
377 //------------------------------------------------------------------------------
378 template <int NCOMP, int COMPSTRIDE, int P1d>
379 inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
380   if (data.tidx < P1d && data.tidy < P1d)
381     for (CeedInt z = 0; z < P1d; ++z) {
382       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
383       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
384       for (CeedInt comp = 0; comp < NCOMP; ++comp)
385         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
386     }
387 }
388 
389 //------------------------------------------------------------------------------
390 // L-vector -> E-vector, strided
391 //------------------------------------------------------------------------------
392 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
393 inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
394   if (data.tidx < P1d && data.tidy < P1d)
395     for (CeedInt z = 0; z < P1d; ++z) {
396       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
397       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
398       for (CeedInt comp = 0; comp < NCOMP; ++comp)
399         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
400     }
401 }
402 
403 //------------------------------------------------------------------------------
404 // E-vector -> Q-vector, offests provided
405 //------------------------------------------------------------------------------
406 template <int NCOMP, int COMPSTRIDE, int Q1d>
407 inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
408   if (data.tidx < Q1d && data.tidy < Q1d) {
409     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
410     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
411     for (CeedInt comp = 0; comp < NCOMP; ++comp)
412       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
413   }
414 }
415 
416 //------------------------------------------------------------------------------
417 // E-vector -> Q-vector, strided
418 //------------------------------------------------------------------------------
419 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
420 inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
421   if (data.tidx < Q1d && data.tidy < Q1d) {
422     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
423     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
424     for (CeedInt comp = 0; comp < NCOMP; ++comp)
425       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
426   }
427 }
428 
429 //------------------------------------------------------------------------------
430 // E-vector -> L-vector, offsets provided
431 //------------------------------------------------------------------------------
432 template <int NCOMP, int COMPSTRIDE, int P1d>
433 inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
434   if (data.tidx < P1d && data.tidy < P1d)
435     for (CeedInt z = 0; z < P1d; ++z) {
436       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
437       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
438       for (CeedInt comp = 0; comp < NCOMP; ++comp)
439         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
440     }
441 }
442 
443 //------------------------------------------------------------------------------
444 // E-vector -> L-vector, strided
445 //------------------------------------------------------------------------------
446 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
447 inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
448   if (data.tidx < P1d && data.tidy < P1d)
449     for (CeedInt z = 0; z < P1d; ++z) {
450       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
451       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
452       for (CeedInt comp = 0; comp < NCOMP; ++comp)
453         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
454     }
455 }
456 
457 //------------------------------------------------------------------------------
458 // 3D tensor contract x
459 //------------------------------------------------------------------------------
460 template <int NCOMP, int P1d, int Q1d>
461 inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
462   CeedScalar r_B[P1d];
463   for (CeedInt i = 0; i < P1d; ++i)
464     r_B[i] = B[i + data.tidx*P1d];
465 
466   for (CeedInt k = 0; k < P1d; ++k) {
467     data.slice[data.tidx+data.tidy*T1d] = U[k];
468     __syncthreads();
469     V[k] = 0.0;
470     if (data.tidx < Q1d && data.tidy < P1d)
471       for (CeedInt i = 0; i < P1d; ++i)
472         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
473     __syncthreads();
474   }
475 }
476 
477 //------------------------------------------------------------------------------
478 // 3D tensor contract y
479 //------------------------------------------------------------------------------
480 template <int NCOMP, int P1d, int Q1d>
481 inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
482   CeedScalar r_B[P1d];
483   for (CeedInt i = 0; i < P1d; ++i)
484     r_B[i] = B[i + data.tidy*P1d];
485 
486   for (CeedInt k = 0; k < P1d; ++k) {
487     data.slice[data.tidx+data.tidy*T1d] = U[k];
488     __syncthreads();
489     V[k] = 0.0;
490     if (data.tidx < Q1d && data.tidy < Q1d)
491       for (CeedInt i = 0; i < P1d; ++i)
492         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
493     __syncthreads();
494   }
495 }
496 
497 //------------------------------------------------------------------------------
498 // 3D tensor contract z
499 //------------------------------------------------------------------------------
500 template <int NCOMP, int P1d, int Q1d>
501 inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
502   for (CeedInt k = 0; k < Q1d; ++k) {
503     V[k] = 0.0;
504     if (data.tidx < Q1d && data.tidy < Q1d)
505       for (CeedInt i = 0; i < P1d; ++i)
506         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
507   }
508 }
509 
510 //------------------------------------------------------------------------------
511 // 3D transpose tensor contract z
512 //------------------------------------------------------------------------------
513 template <int NCOMP, int P1d, int Q1d>
514 inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
515   for (CeedInt k = 0; k < P1d; ++k) {
516     V[k] = 0.0;
517     if (data.tidx < Q1d && data.tidy < Q1d)
518       for (CeedInt i = 0; i < Q1d; ++i)
519         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
520   }
521 }
522 
523 //------------------------------------------------------------------------------
524 // 3D transpose tensor contract y
525 //------------------------------------------------------------------------------
526 template <int NCOMP, int P1d, int Q1d>
527 inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
528   CeedScalar r_B[Q1d];
529   for (CeedInt i = 0; i < Q1d; ++i)
530     r_B[i] = B[data.tidy + i*P1d];
531 
532   for (CeedInt k = 0; k < P1d; ++k) {
533     data.slice[data.tidx+data.tidy*T1d] = U[k];
534     __syncthreads();
535     V[k] = 0.0;
536     if (data.tidx < Q1d && data.tidy < P1d)
537       for (CeedInt i = 0; i < Q1d; ++i)
538         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
539     __syncthreads();
540   }
541 }
542 
543 //------------------------------------------------------------------------------
544 // 3D transpose tensor contract add y
545 //------------------------------------------------------------------------------
546 template <int NCOMP, int P1d, int Q1d>
547 inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
548   CeedScalar r_B[Q1d];
549   for (CeedInt i = 0; i < Q1d; ++i)
550     r_B[i] = B[data.tidy + i*P1d];
551 
552   for (CeedInt k = 0; k < P1d; ++k) {
553     data.slice[data.tidx+data.tidy*T1d] = U[k];
554     __syncthreads();
555     if (data.tidx < Q1d && data.tidy < P1d)
556       for (CeedInt i = 0; i < Q1d; ++i)
557         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
558     __syncthreads();
559   }
560 }
561 
562 //------------------------------------------------------------------------------
563 // 3D transpose tensor contract x
564 //------------------------------------------------------------------------------
565 template <int NCOMP, int P1d, int Q1d>
566 inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
567   CeedScalar r_B[Q1d];
568   for (CeedInt i = 0; i < Q1d; ++i)
569     r_B[i] = B[data.tidx + i*P1d];
570 
571   for (CeedInt k = 0; k < P1d; ++k) {
572     data.slice[data.tidx+data.tidy*T1d] = U[k];
573     __syncthreads();
574     V[k] = 0.0;
575     if (data.tidx < P1d && data.tidy < P1d)
576       for (CeedInt i = 0; i < Q1d; ++i)
577         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
578     __syncthreads();
579   }
580 }
581 
582 //------------------------------------------------------------------------------
583 // 3D transpose tensor contract add x
584 //------------------------------------------------------------------------------
585 template <int NCOMP, int P1d, int Q1d>
586 inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
587   CeedScalar r_B[Q1d];
588   for (CeedInt i = 0; i < Q1d; ++i)
589     r_B[i] = B[data.tidx + i*P1d];
590 
591   for (CeedInt k = 0; k < P1d; ++k) {
592     data.slice[data.tidx+data.tidy*T1d] = U[k];
593     __syncthreads();
594     if (data.tidx < P1d && data.tidy < P1d)
595       for (CeedInt i = 0; i < Q1d; ++i)
596         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
597     __syncthreads();
598   }
599 }
600 
601 //------------------------------------------------------------------------------
602 // 3D interpolate to quadrature points
603 //------------------------------------------------------------------------------
604 template <int NCOMP, int P1d, int Q1d>
605 inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
606   CeedScalar r_t1[T1d];
607   CeedScalar r_t2[T1d];
608   for (CeedInt comp = 0; comp < NCOMP; comp++) {
609     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
610     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
611     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
612   }
613 }
614 
615 //------------------------------------------------------------------------------
616 // 3D interpolate transpose
617 //------------------------------------------------------------------------------
618 template <int NCOMP, int P1d, int Q1d>
619 inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
620   CeedScalar r_t1[T1d];
621   CeedScalar r_t2[T1d];
622   for (CeedInt comp = 0; comp < NCOMP; comp++) {
623     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
624     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
625     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
626   }
627 }
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives at quadrature points
631 //------------------------------------------------------------------------------
632 template <int NCOMP, int P1d, int Q1d>
633 inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
634   CeedScalar r_t1[T1d];
635   CeedScalar r_t2[T1d];
636   for (CeedInt comp = 0; comp < NCOMP; comp++) {
637     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
638     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
639     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
640     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
641     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
642     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
643     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
644     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
645     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
646   }
647 }
648 
649 //------------------------------------------------------------------------------
650 // 3D derivatives transpose
651 //------------------------------------------------------------------------------
652 template <int NCOMP, int P1d, int Q1d>
653 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
654   CeedScalar r_t1[T1d];
655   CeedScalar r_t2[T1d];
656   for (CeedInt comp = 0; comp < NCOMP; comp++) {
657     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
658     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
659     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
660     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
661     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
662     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
663     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
664     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
665     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
666   }
667 }
668 
669 //------------------------------------------------------------------------------
670 // 3D collocated derivatives computation
671 //------------------------------------------------------------------------------
672 template <int NCOMP, int Q1d>
673 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
674   if (data.tidx < Q1d && data.tidy < Q1d) {
675     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
676       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
677       __syncthreads();
678       // X derivative
679       r_V[comp+0*NCOMP] = 0.0;
680       for (CeedInt i = 0; i < Q1d; ++i)
681         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
682       // Y derivative
683       r_V[comp+1*NCOMP] = 0.0;
684       for (CeedInt i = 0; i < Q1d; ++i)
685         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
686       // Z derivative
687       r_V[comp+2*NCOMP] = 0.0;
688       for (CeedInt i = 0; i < Q1d; ++i)
689         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
690       __syncthreads();
691     }
692   }
693 }
694 
695 //------------------------------------------------------------------------------
696 // 3D collocated derivatives transpose
697 //------------------------------------------------------------------------------
698 template <int NCOMP, int Q1d>
699 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
700   if (data.tidx < Q1d && data.tidy < Q1d) {
701     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
702       // X derivative
703       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
704       __syncthreads();
705       for (CeedInt i = 0; i < Q1d; ++i)
706         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
707       __syncthreads();
708       // Y derivative
709       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
710       __syncthreads();
711       for (CeedInt i = 0; i < Q1d; ++i)
712         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
713       __syncthreads();
714       // Z derivative
715       for (CeedInt i = 0; i < Q1d; ++i)
716         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
717     }
718   }
719 }
720 
721 //------------------------------------------------------------------------------
722 // 1D quadrature weights
723 //------------------------------------------------------------------------------
724 template <int Q1d>
725 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
726   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
727 }
728 
729 //------------------------------------------------------------------------------
730 // 2D quadrature weights
731 //------------------------------------------------------------------------------
732 template <int Q1d>
733 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
734   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
735         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
736 }
737 
738 //------------------------------------------------------------------------------
739 // 3D quadrature weights
740 //------------------------------------------------------------------------------
741 template <int Q1d>
742 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
743   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
744   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
745   for (CeedInt z = 0; z < Q1d; ++z)
746     w[z] = quad ? pw*qweight1d[z] : 0.0;
747 }
748 
749 );
750 //------------------------------------------------------------------------------
751 // Build singe operator kernel
752 //------------------------------------------------------------------------------
753 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
754 
755   using std::ostringstream;
756   using std::string;
757   int ierr;
758   bool setupdone;
759   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760   if (setupdone) return CEED_ERROR_SUCCESS;
761   Ceed ceed;
762   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
763   CeedOperator_Hip_gen *data;
764   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
765   CeedQFunction qf;
766   CeedQFunction_Hip_gen *qf_data;
767   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769   CeedSize lsize;
770   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
771           numoutputfields, ncomp, dim = 1;
772   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
773   Q1d = Q;
774   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
775   CeedOperatorField *opinputfields, *opoutputfields;
776   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
777   CeedChkBackend(ierr);
778   CeedQFunctionField *qfinputfields, *qfoutputfields;
779   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
780   CeedChkBackend(ierr);
781   CeedEvalMode emode;
782   CeedBasis basis;
783   CeedBasis_Hip_shared *basis_data;
784   CeedElemRestriction Erestrict;
785   CeedElemRestriction_Hip *restr_data;
786 
787   // Check for restriction only identity operator
788   bool is_identity_qf;
789   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
790   if (is_identity_qf) {
791     CeedEvalMode emodein, emodeout;
792     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
793     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
794     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
795       // LCOV_EXCL_START
796       return CeedError(ceed, CEED_ERROR_BACKEND,
797                        "Backend does not implement restriction only identity operators");
798     // LCOV_EXCL_STOP
799   }
800 
801   ostringstream code;
802   string devFunctions(deviceFunctions);
803   code << devFunctions;
804 
805   string qFunction(qf_data->qFunctionSource);
806   string qFunctionName(qf_data->qFunctionName);
807   string oper;
808   oper = "CeedKernel_Hip_gen_" + qFunctionName;
809 
810   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
811   code << "#define CEED_QFUNCTION_HELPER __device__ __forceinline__\n";
812   code << "#define CeedPragmaSIMD\n";
813   code << "#define CEED_ERROR_SUCCESS 0\n\n";
814 
815   // Find dim and Q1d
816   bool useCollograd = true;
817   bool allCollograd = true;
818   data->maxP1d = 0;
819   for (CeedInt i = 0; i < numinputfields; i++) {
820     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
821     if (basis != CEED_BASIS_COLLOCATED) {
822       allCollograd = false;
823       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
824       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
825       CeedChkBackend(ierr);
826 
827       // Check for collocated gradient
828       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
829 
830       // Collect dim and Q1d
831       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
832       bool isTensor;
833       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
834       if (isTensor) {
835         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
836         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
837         if (P1d>data->maxP1d) data->maxP1d = P1d;
838       } else {
839         // LCOV_EXCL_START
840         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
841         // LCOV_EXCL_STOP
842         }
843     }
844   }
845   // Check output bases for Q1d, dim as well
846   //   The only imput basis might be CEED_BASIS_COLLOCATED
847   for (CeedInt i = 0; i < numoutputfields; i++) {
848     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
849 
850     if (basis != CEED_BASIS_COLLOCATED) {
851       allCollograd = false;
852       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
853       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
854       CeedChkBackend(ierr);
855 
856       // Collect dim and Q1d
857       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
858       bool isTensor;
859       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
860       if (isTensor) {
861         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
862       } else {
863         // LCOV_EXCL_START
864         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
865         // LCOV_EXCL_STOP
866         }
867 
868       // Check for collocated gradient
869       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
870     }
871   }
872   data->dim = dim;
873   data->Q1d = Q1d;
874   // TODO: https://github.com/CEED/libCEED/pull/1009#issuecomment-1176751436
875   useCollograd &= !allCollograd;
876 
877   // Define CEED_Q_VLA
878   if (dim != 3 || useCollograd) {
879     code << "\n#define CEED_Q_VLA 1\n\n";
880   } else {
881     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
882   }
883 
884   code << qFunction;
885 
886   // Setup
887   code << "\n// -----------------------------------------------------------------------------\n";
888   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
889   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
890   for (CeedInt i = 0; i < numinputfields; i++) {
891     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
892     CeedChkBackend(ierr);
893     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
894       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
895     }
896   }
897 
898   for (CeedInt i = 0; i < numoutputfields; i++) {
899     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
900   }
901 
902   code << "  const CeedInt Dim = "<<dim<<";\n";
903   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
904 
905   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
906   code << "  BackendData data;\n";
907   code << "  data.tidx = threadIdx.x;\n";
908   code << "  data.tidy = threadIdx.y;\n";
909   code << "  data.tidz = threadIdx.z;\n";
910   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
911   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
912 
913   code << "\n  // -- Input field constants and basis data --\n";
914   //Initialize constants, and matrices B and G
915   for (CeedInt i = 0; i < numinputfields; i++) {
916     code << "  // ---- Input field "<<i<<" ----\n";
917     // Get elemsize, emode, ncomp
918     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
919     CeedChkBackend(ierr);
920     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
921     CeedChkBackend(ierr);
922     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
923     CeedChkBackend(ierr);
924     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
925     CeedChkBackend(ierr);
926 
927     // Set field constants
928     if (emode != CEED_EVAL_WEIGHT) {
929       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
930       if (basis != CEED_BASIS_COLLOCATED) {
931         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
932         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
933       } else {
934         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
935       }
936       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
937     }
938 
939     // Load basis data
940     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
941     switch (emode) {
942     case CEED_EVAL_NONE:
943       break;
944     case CEED_EVAL_INTERP:
945       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
946       data->B.in[i] = basis_data->d_interp_1d;
947       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
948       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
949       break;
950     case CEED_EVAL_GRAD:
951       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
952       data->B.in[i] = basis_data->d_interp_1d;
953       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
954       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
955       if (useCollograd) {
956         data->G.in[i] = basis_data->d_collo_grad_1d;
957         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
958         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
959       } else {
960         data->G.in[i] = basis_data->d_grad_1d;
961         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
962         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
963       }
964       break;
965     case CEED_EVAL_WEIGHT:
966       break; // No action
967     case CEED_EVAL_DIV:
968       break; // TODO: Not implemented
969     case CEED_EVAL_CURL:
970       break; // TODO: Not implemented
971     }
972   }
973 
974   code << "\n  // -- Output field constants and basis data --\n";
975   for (CeedInt i = 0; i < numoutputfields; i++) {
976     code << "  // ---- Output field "<<i<<" ----\n";
977     // Get elemsize, emode, ncomp
978     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
979     CeedChkBackend(ierr);
980     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
981     CeedChkBackend(ierr);
982     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
983     CeedChkBackend(ierr);
984     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
985     CeedChkBackend(ierr);
986 
987     // Set field constants
988     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
989     if (basis != CEED_BASIS_COLLOCATED) {
990       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
991       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
992     } else {
993       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
994     }
995     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
996 
997     // Load basis data
998     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
999     switch (emode) {
1000     case CEED_EVAL_NONE:
1001       break; // No action
1002     case CEED_EVAL_INTERP:
1003       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1004       data->B.out[i] = basis_data->d_interp_1d;
1005       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1006       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1007       break;
1008     case CEED_EVAL_GRAD:
1009       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1010       data->B.out[i] = basis_data->d_interp_1d;
1011       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1012       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
1013       if (useCollograd) {
1014         data->G.out[i] = basis_data->d_collo_grad_1d;
1015         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
1016         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1017       } else {
1018         data->G.out[i] = basis_data->d_grad_1d;
1019         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1020         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1021       }
1022       break;
1023     // LCOV_EXCL_START
1024     case CEED_EVAL_WEIGHT: {
1025       Ceed ceed;
1026       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1027       return CeedError(ceed, CEED_ERROR_BACKEND,
1028                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1029       break; // Should not occur
1030     }
1031     case CEED_EVAL_DIV:
1032       break; // TODO: Not implemented
1033     case CEED_EVAL_CURL:
1034       break; // TODO: Not implemented
1035       // LCOV_EXCL_STOP
1036     }
1037   }
1038   code << "\n  // -- Element loop --\n";
1039   code << "  __syncthreads();\n";
1040   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1041   // Input basis apply if needed
1042   // Generate the correct eval mode code for each input
1043   code << "    // -- Input field restrictions and basis actions --\n";
1044   for (CeedInt i = 0; i < numinputfields; i++) {
1045     code << "    // ---- Input field "<<i<<" ----\n";
1046     // Get elemsize, emode, ncomp
1047     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1048     CeedChkBackend(ierr);
1049     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1050     CeedChkBackend(ierr);
1051     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1052     CeedChkBackend(ierr);
1053     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1054     CeedChkBackend(ierr);
1055 
1056     // Restriction
1057     if (emode != CEED_EVAL_WEIGHT &&
1058         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1059       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1060 
1061       bool isStrided;
1062       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1063       if (!isStrided) {
1064         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1065         CeedChkBackend(ierr);
1066         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1067         CeedInt compstride;
1068         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1069         code << "    // CompStride: "<<compstride<<"\n";
1070         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1071         data->indices.in[i] = restr_data->d_ind;
1072         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1073       } else {
1074         bool backendstrides;
1075         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1076         CeedChkBackend(ierr);
1077         CeedInt nelem;
1078         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1079         CeedChkBackend(ierr);
1080         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1081         if (!backendstrides) {
1082           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1083           CeedChkBackend(ierr);
1084         }
1085         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1086         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1087       }
1088     }
1089 
1090     // Basis action
1091     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1092     switch (emode) {
1093     case CEED_EVAL_NONE:
1094       if (!useCollograd) {
1095         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1096       }
1097       break;
1098     case CEED_EVAL_INTERP:
1099       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1100       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1101       break;
1102     case CEED_EVAL_GRAD:
1103       if (useCollograd) {
1104         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1105         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1106       } else {
1107         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1108         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1109       }
1110       break;
1111     case CEED_EVAL_WEIGHT:
1112       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1113       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1114       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1115       data->W = basis_data->d_q_weight_1d;
1116       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1117       break; // No action
1118     case CEED_EVAL_DIV:
1119       break; // TODO: Not implemented
1120     case CEED_EVAL_CURL:
1121       break; // TODO: Not implemented
1122     }
1123   }
1124 
1125   // Q function
1126   code << "\n    // -- Output field setup --\n";
1127   for (CeedInt i = 0; i < numoutputfields; i++) {
1128       code << "\n    // ---- Output field "<<i<<" ----\n";
1129     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1130     CeedChkBackend(ierr);
1131     if (emode==CEED_EVAL_GRAD)
1132     {
1133       if (useCollograd) {
1134         //Accumulator for gradient slices
1135         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1136         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1137         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1138         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1139         code << "      }\n";
1140         code << "    }\n";
1141       } else {
1142         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1143       }
1144     }
1145     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1146     {
1147       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1148     }
1149   }
1150   // We treat quadrature points per slice in 3d to save registers
1151   if (useCollograd) {
1152     code << "\n    // Note: Collocated Gradient\n";
1153     code << "#pragma unroll\n";
1154     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1155     code << "      // -- Input fields --\n";
1156     for (CeedInt i = 0; i < numinputfields; i++) {
1157       code << "      // ---- Input field "<<i<<" ----\n";
1158       // Get elemsize, emode, ncomp
1159       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1160       CeedChkBackend(ierr);
1161       // Basis action
1162       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1163       switch (emode) {
1164       case CEED_EVAL_NONE:
1165         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1166 
1167         bool isStrided;
1168         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1169         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1170         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1171         if (!isStrided) {
1172           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1173           CeedChkBackend(ierr);
1174           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1175           CeedInt compstride;
1176           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1177           code << "      // CompStride: "<<compstride<<"\n";
1178           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1179           data->indices.in[i] = restr_data->d_ind;
1180           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1181         } else {
1182           bool backendstrides;
1183           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1184           CeedChkBackend(ierr);
1185           CeedInt nelem;
1186           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1187           CeedChkBackend(ierr);
1188           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1189           if (!backendstrides) {
1190             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1191             CeedChkBackend(ierr);
1192           }
1193           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1194           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1195         }
1196         break;
1197       case CEED_EVAL_INTERP:
1198         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1199         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1200         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1201         code << "      }\n";
1202         break;
1203       case CEED_EVAL_GRAD:
1204         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1205         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1206         break;
1207       case CEED_EVAL_WEIGHT:
1208         code << "      CeedScalar r_q"<<i<<"[1];\n";
1209         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1210         break; // No action
1211       case CEED_EVAL_DIV:
1212         break; // TODO: Not implemented
1213       case CEED_EVAL_CURL:
1214         break; // TODO: Not implemented
1215       }
1216     }
1217     code << "\n      // -- Output fields --\n";
1218     for (CeedInt i = 0; i < numoutputfields; i++) {
1219       code << "      // ---- Output field "<<i<<" ----\n";
1220       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1221       CeedChkBackend(ierr);
1222       // Basis action
1223       switch (emode) {
1224       case CEED_EVAL_NONE:
1225         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1226         break; // No action
1227       case CEED_EVAL_INTERP:
1228         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1229         break;
1230       case CEED_EVAL_GRAD:
1231         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1232         break;
1233       case CEED_EVAL_WEIGHT:
1234         break; // Should not occur
1235       case CEED_EVAL_DIV:
1236         break; // TODO: Not implemented
1237       case CEED_EVAL_CURL:
1238         break; // TODO: Not implemented
1239       }
1240     }
1241   } else {
1242     code << "\n      // Note: No Collocated Gradient\n";
1243     code << "      // -- Input fields --\n";
1244     for (CeedInt i = 0; i < numinputfields; i++) {
1245       code << "      // ---- Input field "<<i<<" ----\n";
1246       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1247     }
1248     code << "      // -- Output fields --\n";
1249     for (CeedInt i = 0; i < numoutputfields; i++) {
1250       code << "      // ---- Output field "<<i<<" ----\n";
1251       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1252     }
1253   }
1254   code << "\n      // -- QFunction Inputs and outputs --\n";
1255   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1256   for (CeedInt i = 0; i < numinputfields; i++) {
1257     code << "      // ---- Input field "<<i<<" ----\n";
1258     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1259   }
1260   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1261   for (CeedInt i = 0; i < numoutputfields; i++) {
1262     code << "      // ---- Output field "<<i<<" ----\n";
1263     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1264   }
1265   code << "\n      // -- Apply QFunction --\n";
1266   code << "      "<<qFunctionName<<"(ctx, ";
1267   if (dim != 3 || useCollograd) {
1268     code << "1";
1269   } else {
1270     code << "Q1d";
1271   }
1272   code << ", in, out);\n";
1273   if (useCollograd) {
1274     code << "\n      // Note: Collocated Gradient\n";
1275     code << "      // -- Output fields --\n";
1276     for (CeedInt i = 0; i < numoutputfields; i++) {
1277       code << "      // ---- Output field "<<i<<" ----\n";
1278       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1279       CeedChkBackend(ierr);
1280       // Basis action
1281       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1282       switch (emode) {
1283       case CEED_EVAL_NONE:
1284         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1285         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1286         code << "      }\n";
1287         break; // No action
1288       case CEED_EVAL_INTERP:
1289         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1290         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1291         code << "      }\n";
1292         break;
1293       case CEED_EVAL_GRAD:
1294         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1295         break;
1296       case CEED_EVAL_WEIGHT:
1297         break; // Should not occur
1298       case CEED_EVAL_DIV:
1299         break; // TODO: Not implemented
1300       case CEED_EVAL_CURL:
1301         break; // TODO: Not implemented
1302       }
1303     }
1304     code << "    }\n";
1305   }
1306 
1307   // Output basis apply if needed
1308   // Generate the correct eval mode code for each output
1309   code << "\n    // -- Output field basis action and restrictions --\n";
1310   for (CeedInt i = 0; i < numoutputfields; i++) {
1311     code << "    // ---- Output field "<<i<<" ----\n";
1312     // Get elemsize, emode, ncomp
1313     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1314     CeedChkBackend(ierr);
1315     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1316     CeedChkBackend(ierr);
1317     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1318     CeedChkBackend(ierr);
1319     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1320     CeedChkBackend(ierr);
1321     // Basis action
1322     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1323     switch (emode) {
1324     case CEED_EVAL_NONE:
1325       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1326       break; // No action
1327     case CEED_EVAL_INTERP:
1328       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1329       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1330       break;
1331     case CEED_EVAL_GRAD:
1332       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1333       if (useCollograd) {
1334         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1335       } else {
1336         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1337       }
1338       break;
1339     // LCOV_EXCL_START
1340     case CEED_EVAL_WEIGHT: {
1341       Ceed ceed;
1342       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1343       return CeedError(ceed, CEED_ERROR_BACKEND,
1344                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1345       break; // Should not occur
1346     }
1347     case CEED_EVAL_DIV:
1348       break; // TODO: Not implemented
1349     case CEED_EVAL_CURL:
1350       break; // TODO: Not implemented
1351       // LCOV_EXCL_STOP
1352     }
1353     // Restriction
1354       bool isStrided;
1355       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1356     if (!isStrided) {
1357       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1358       CeedChkBackend(ierr);
1359       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1360       CeedInt compstride;
1361       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1362       code << "    // CompStride: "<<compstride<<"\n";
1363       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1364       data->indices.out[i] = restr_data->d_ind;
1365       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1366     } else {
1367       bool backendstrides;
1368       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1369       CeedChkBackend(ierr);
1370       CeedInt nelem;
1371       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1372       CeedChkBackend(ierr);
1373       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1374       if (!backendstrides) {
1375         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1376         CeedChkBackend(ierr);
1377       }
1378       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1379       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1380     }
1381   }
1382 
1383   code << "  }\n";
1384   code << "}\n";
1385   code << "// -----------------------------------------------------------------------------\n\n";
1386 
1387   // View kernel for debugging
1388   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
1389   CeedDebug(ceed, code.str().c_str());
1390 
1391   CeedInt block_sizes[3] = {0, 0, 0};
1392   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1393   CeedChkBackend(ierr);
1394   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1395                          "T1d", block_sizes[0],
1396                          "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1397   CeedChkBackend(ierr);
1398   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1399   CeedChkBackend(ierr);
1400 
1401   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1402   return CEED_ERROR_SUCCESS;
1403 }
1404 //------------------------------------------------------------------------------
1405