xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision dcefb99bf4e43737baea662a94271a0fc9c30aca)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #define CEED_DEBUG_COLOR 12
18 
19 #include <ceed/ceed.h>
20 #include <ceed/backend.h>
21 #include <iostream>
22 #include <string>
23 #include <sstream>
24 #include "ceed-hip-gen.h"
25 #include "../hip/ceed-hip.h"
26 #include "../hip-shared/ceed-hip-shared.h"
27 #include "../hip/ceed-hip-compile.h"
28 
29 static const char *deviceFunctions = QUOTE(
30 
31 //------------------------------------------------------------------------------
32 // Typedefs
33 //------------------------------------------------------------------------------
// Per-field device pointers for up to 16 input and 16 output fields (scalars)
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
// Per-field device pointers for up to 16 input and 16 output index arrays
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread backend state used by all device helpers below.
// NOTE(review): tidx/tidy/tidz/tid are presumably the thread indices within the
// block and the flattened thread id, filled in by the generated kernel — confirm
// against the kernel-body generator.
typedef struct {
  CeedInt tidx;       // thread index, x direction
  CeedInt tidy;       // thread index, y direction
  CeedInt tidz;       // thread index, z direction
  CeedInt tid;        // flattened thread index
  CeedScalar* slice;  // this thread-block's shared-memory scratch for contractions
} BackendData;
44 
45 //------------------------------------------------------------------------------
46 // Load matrices for basis actions
47 //------------------------------------------------------------------------------
template <int P, int Q>
// Cooperative block-stride copy of a P*Q basis matrix from d_B into B.
// Every thread in the block participates; B is presumably shared memory —
// caller is responsible for any synchronization before B is read.
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt stride = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt idx = data.tid; idx < P*Q; idx += stride)
    B[idx] = d_B[idx];
}
53 
54 //------------------------------------------------------------------------------
55 // 1D
56 //------------------------------------------------------------------------------
57 
58 //------------------------------------------------------------------------------
59 // L-vector -> E-vector, offsets provided
60 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// Gather one node's dofs (all components) from the L-vector d_u into registers
// r_u, using the element restriction indices. One thread per node; nnodes is
// unused here but kept for a uniform reader signature.
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return;  // threads beyond P1d carry no dofs
  const CeedInt ind = indices[data.tidx + elem*P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    r_u[comp] = d_u[ind + comp*COMPSTRIDE];
}
70 
71 //------------------------------------------------------------------------------
72 // L-vector -> E-vector, strided
73 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// Gather one node's dofs (all components) from a strided L-vector into
// registers r_u. Strides over node/component/element are compile-time.
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return;  // threads beyond P1d carry no dofs
  const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    r_u[comp] = d_u[base + comp*STRIDES_COMP];
}
83 
84 //------------------------------------------------------------------------------
85 // E-vector -> L-vector, offsets provided
86 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// Scatter-add one node's register dofs r_v back to the L-vector d_v through the
// restriction indices. atomicAdd is required because a node shared by several
// elements is written by several threads.
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return;  // threads beyond P1d carry no dofs
  const CeedInt ind = indices[data.tidx + elem*P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    atomicAdd(&d_v[ind + comp*COMPSTRIDE], r_v[comp]);
}
96 
97 //------------------------------------------------------------------------------
98 // E-vector -> L-vector, strided
99 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// Accumulate one node's register dofs r_v into a strided L-vector. Plain +=
// (no atomics): strided layouts are presumably element-exclusive — confirm
// against the restriction setup.
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return;  // threads beyond P1d carry no dofs
  const CeedInt base = data.tidx*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    d_v[base + comp*STRIDES_COMP] += r_v[comp];
}
109 
110 //------------------------------------------------------------------------------
111 // 1D tensor contraction x
112 //------------------------------------------------------------------------------
// Contract in x: V(q) = sum_i B(i + q*P1d) * U(i), one output value per thread.
// Uses the shared slice to exchange U across threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so every thread can read all P1d values
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  // Barrier stays outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
123 
124 //------------------------------------------------------------------------------
125 // 1D transpose tensor contraction x
126 //------------------------------------------------------------------------------
// Transpose contract in x: V(p) = sum_i B(p + i*P1d) * U(i) — B applied
// transposed relative to ContractX1d. Uses the shared slice to exchange U.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value so every thread can read all Q1d values
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  // Barrier stays outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
137 
138 //------------------------------------------------------------------------------
139 // 1D interpolate to quadrature points
140 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Interpolate nodal values to quadrature points, one component at a time.
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
146 
147 //------------------------------------------------------------------------------
148 // 1D interpolate transpose
149 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Transpose interpolation: quadrature-point values back to nodes, per component.
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
155 
156 //------------------------------------------------------------------------------
157 // 1D derivatives at quadrature points
158 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// 1D gradient at quadrature points: apply the derivative matrix c_G per
// component (c_B is unused in 1D but kept for a uniform signature).
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
164 
165 //------------------------------------------------------------------------------
166 // 1D derivatives transpose
167 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Transpose of grad1d: apply the transposed derivative matrix per component
// (c_B unused in 1D, kept for a uniform signature).
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
173 
174 //------------------------------------------------------------------------------
175 // 2D
176 //------------------------------------------------------------------------------
177 
178 //------------------------------------------------------------------------------
179 // L-vector -> E-vector, offsets provided
180 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// 2D gather: one thread per (tidx, tidy) node reads all components of its dof
// from the L-vector via the restriction indices. nnodes unused; uniform signature.
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  const CeedInt node = data.tidx + data.tidy*P1d;
  const CeedInt ind = indices[node + elem*P1d*P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    r_u[comp] = d_u[ind + comp*COMPSTRIDE];
}
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, strided
193 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// 2D gather from a strided L-vector: one thread per (tidx, tidy) node reads all
// components into registers using compile-time strides.
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  const CeedInt node = data.tidx + data.tidy*P1d;
  const CeedInt base = node*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    r_u[comp] = d_u[base + comp*STRIDES_COMP];
}
203 
204 //------------------------------------------------------------------------------
205 // E-vector -> L-vector, offsets provided
206 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// 2D scatter-add of register dofs to the L-vector via restriction indices.
// atomicAdd because nodes shared between elements receive concurrent writes.
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  const CeedInt node = data.tidx + data.tidy*P1d;
  const CeedInt ind = indices[node + elem*P1d*P1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    atomicAdd(&d_v[ind + comp*COMPSTRIDE], r_v[comp]);
}
216 
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, strided
219 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// 2D accumulate into a strided L-vector. Plain += (no atomics): strided
// layouts are presumably element-exclusive — confirm against restriction setup.
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  const CeedInt node = data.tidx + data.tidy*P1d;
  const CeedInt base = node*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    d_v[base + comp*STRIDES_COMP] += r_v[comp];
}
229 
230 //------------------------------------------------------------------------------
231 // 2D tensor contraction x
232 //------------------------------------------------------------------------------
// 2D contract in x: V(q,j) = sum_i B(i + q*P1d) * U(i,j), one value per thread.
// The shared slice is used as a T1d x T1d tile to exchange U across the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared tile (row-major, pitch T1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  // Barrier outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
243 
244 //------------------------------------------------------------------------------
245 // 2D tensor contract y
246 //------------------------------------------------------------------------------
// 2D contract in y: V(i,q) = sum_j B(j + q*P1d) * U(i,j), one value per thread.
// The shared slice is used as a T1d x T1d tile to exchange U across the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared tile (row-major, pitch T1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  // Barrier outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
257 
258 //------------------------------------------------------------------------------
259 // 2D transpose tensor contract y
260 //------------------------------------------------------------------------------
// 2D transpose contract in y: V(i,p) = sum_q B(p + q*P1d) * U(i,q) — B applied
// transposed relative to ContractY2d. Shared slice exchanges U across threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared tile (row-major, pitch T1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  // Barrier outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
271 
272 //------------------------------------------------------------------------------
273 // 2D transpose tensor contract x
274 //------------------------------------------------------------------------------
// 2D transpose contract in x: V(p,j) = sum_q B(p + q*P1d) * U(q,j) — B applied
// transposed relative to ContractX2d. Shared slice exchanges U across threads.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared tile (row-major, pitch T1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  // Barrier outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
285 
286 //------------------------------------------------------------------------------
287 // 2D transpose tensor contract and add x
288 //------------------------------------------------------------------------------
// Same as ContractXTranspose2d but ACCUMULATES into V (no zeroing) — used to
// sum the x- and y-derivative contributions in gradTranspose2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage each thread's value into the shared tile (row-major, pitch T1d)
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  // Barrier outside the divergent branch; protects reuse of slice by the caller
  __syncthreads();
}
298 
299 //------------------------------------------------------------------------------
300 // 2D interpolate to quadrature points
301 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// 2D interpolation to quadrature points: contract x then y with c_B, per component.
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];  // per-thread staging value between the two contractions
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
310 
311 //------------------------------------------------------------------------------
312 // 2D interpolate transpose
313 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Transpose of interp2d: contract y then x with the transposed basis, per component.
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];  // per-thread staging value between the two contractions
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
322 
323 //------------------------------------------------------------------------------
324 // 2D derivatives at quadrature points
325 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// 2D gradient at quadrature points. For each component: derivative matrix c_G
// on one direction, interpolation c_B on the other. Output layout: d/dx values
// in slots [0, NCOMP), d/dy values in slots [NCOMP, 2*NCOMP).
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];  // staging between directional contractions
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);          // d/dx
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c + NCOMP]);  // d/dy
  }
}
336 
337 //------------------------------------------------------------------------------
338 // 2D derivatives transpose
339 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Transpose of grad2d: pull both derivative contributions back to nodes. The
// d/dx part initializes r_V, the d/dy part is accumulated via the Add variant.
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];  // staging between directional contractions
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_G, &r_V[c]);          // from d/dx
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c + NCOMP], c_G, tmp);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);       // += from d/dy
  }
}
350 
351 //------------------------------------------------------------------------------
352 // 3D
353 //------------------------------------------------------------------------------
354 
355 //------------------------------------------------------------------------------
356 // L-vector -> E-vector, offsets provided
357 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// 3D gather via restriction indices: a 2D thread slab loops over z-slices,
// storing r_u as [z + comp*P1d]. nnodes unused; uniform reader signature.
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
    const CeedInt ind = indices[node + elem*P1d*P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[z + comp*P1d] = d_u[ind + comp*COMPSTRIDE];
  }
}
368 
369 //------------------------------------------------------------------------------
370 // L-vector -> E-vector, strided
371 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// 3D gather from a strided L-vector: a 2D thread slab loops over z-slices,
// storing r_u as [z + comp*P1d] with compile-time strides.
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
    const CeedInt base = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[z + comp*P1d] = d_u[base + comp*STRIDES_COMP];
  }
}
382 
383 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
385 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int Q1d>
// Read one z-slice (index q) of quadrature-point values for all components
// from an E-vector through the provided indices. nquads unused; uniform signature.
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    // Fixed: stray double semicolon (empty statement) removed
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
395 
396 //------------------------------------------------------------------------------
397 // E-vector -> Q-vector, strided
398 //------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// Read one z-slice (index q) of quadrature-point values for all components
// from a strided E-vector.
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= Q1d || data.tidy >= Q1d) return;  // off-slab threads idle
  const CeedInt node = data.tidx + Q1d*(data.tidy + Q1d*q);
  const CeedInt base = node*STRIDES_NODE + elem*STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[base + comp*STRIDES_COMP];
}
408 
409 //------------------------------------------------------------------------------
410 // E-vector -> L-vector, offsets provided
411 //------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
// 3D scatter-add via restriction indices: a 2D thread slab loops over
// z-slices; atomicAdd because element-shared nodes get concurrent writes.
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
    const CeedInt ind = indices[node + elem*P1d*P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + comp*COMPSTRIDE], r_v[z + comp*P1d]);
  }
}
422 
423 //------------------------------------------------------------------------------
424 // E-vector -> L-vector, strided
425 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
// 3D accumulate into a strided L-vector, looping over z-slices. Plain +=
// (no atomics): strided layouts are presumably element-exclusive — confirm.
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;  // off-element threads idle
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = data.tidx + P1d*(data.tidy + P1d*z);
    const CeedInt base = node*STRIDES_NODE + elem*STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[base + comp*STRIDES_COMP] += r_v[z + comp*P1d];
  }
}
436 
437 //------------------------------------------------------------------------------
438 // 3D tensor contract x
439 //------------------------------------------------------------------------------
// 3D contract in x, processed one z-slice at a time through the shared 2D tile.
// Each thread holds a z-column of U/V in registers (indexed by k).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers; reused for every z-slice
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across x
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
456 
457 //------------------------------------------------------------------------------
458 // 3D tensor contract y
459 //------------------------------------------------------------------------------
// 3D contract in y, processed one z-slice at a time through the shared 2D tile.
// Each thread holds a z-column of U/V in registers (indexed by k).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers; reused for every z-slice
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across y
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
476 
477 //------------------------------------------------------------------------------
478 // 3D tensor contract z
479 //------------------------------------------------------------------------------
// 3D contract in z. The z data lives entirely in this thread's registers, so
// no shared memory or synchronization is needed for this direction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    // V[k] zeroed unconditionally so out-of-range threads hold defined values
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
489 
490 //------------------------------------------------------------------------------
491 // 3D transpose tensor contract z
492 //------------------------------------------------------------------------------
// 3D transpose contract in z (B applied transposed). Register-only per thread;
// no shared memory or synchronization needed for this direction.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    // V[k] zeroed unconditionally so out-of-range threads hold defined values
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
502 
503 //------------------------------------------------------------------------------
504 // 3D transpose tensor contract y
505 //------------------------------------------------------------------------------
// 3D transpose contract in y (B applied transposed), one z-slice at a time
// through the shared 2D tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers; reused for every z-slice
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across y
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
522 
523 //------------------------------------------------------------------------------
524 // 3D transpose tensor contract add y
525 //------------------------------------------------------------------------------
// Same as ContractTransposeY3d but ACCUMULATES into V (no V[k] = 0.0) — used
// to sum derivative contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers; reused for every z-slice
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across y
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
541 
542 //------------------------------------------------------------------------------
543 // 3D transpose tensor contract x
544 //------------------------------------------------------------------------------
// 3D transpose contract in x (B applied transposed), one z-slice at a time
// through the shared 2D tile.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers; reused for every z-slice
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across x
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
561 
562 //------------------------------------------------------------------------------
563 // 3D transpose tensor contract add x
564 //------------------------------------------------------------------------------
// Same as ContractTransposeX3d but ACCUMULATES into V (no V[k] = 0.0) — used
// to sum derivative contributions in gradTranspose3d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers; reused for every z-slice
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    // Stage slice k of U into shared memory so threads can read across x
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    // Barrier outside the divergent branch; slice is overwritten next iteration
    __syncthreads();
  }
}
580 
581 //------------------------------------------------------------------------------
582 // 3D interpolate to quadrature points
583 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// 3D interpolation to quadrature points: contract x, y, z in sequence with c_B.
// Input columns are P1d long per component; output columns Q1d long.
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];  // staging column after the x contraction
  CeedScalar tmp2[T1d];  // staging column after the y contraction
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d]);
  }
}
594 
595 //------------------------------------------------------------------------------
596 // 3D interpolate transpose
597 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// Transpose of interp3d: contract z, y, x in sequence with the transposed basis.
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];  // staging column after the z contraction
  CeedScalar tmp2[T1d];  // staging column after the y contraction
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, tmp1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*P1d]);
  }
}
608 
609 //------------------------------------------------------------------------------
610 // 3D derivatives at quadrature points
611 //------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
// 3D gradient at quadrature points. For each component, applies c_G along one
// direction and c_B along the other two. Output layout: d/dx in the first
// NCOMP*Q1d slots, d/dy in the second, d/dz in the third.
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp1[T1d];  // staging column after the x contraction
  CeedScalar tmp2[T1d];  // staging column after the y contraction
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_G, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d]);                  // d/dx
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_G, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_B, &r_V[c*Q1d + NCOMP*Q1d]);      // d/dy
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, tmp1);
    ContractY3d<NCOMP, P1d, Q1d>(data, tmp1, c_B, tmp2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, tmp2, c_G, &r_V[c*Q1d + 2*NCOMP*Q1d]);    // d/dz
  }
}
628 
629 //------------------------------------------------------------------------------
630 // 3D derivatives transpose
631 //------------------------------------------------------------------------------
632 template <int NCOMP, int P1d, int Q1d>
633 inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
634   CeedScalar r_t1[T1d];
635   CeedScalar r_t2[T1d];
636   for (CeedInt comp = 0; comp < NCOMP; comp++) {
637     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
638     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
639     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
640     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
641     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
642     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
643     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
644     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
645     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
646   }
647 }
648 
649 //------------------------------------------------------------------------------
650 // 3D collocated derivatives computation
651 //------------------------------------------------------------------------------
652 template <int NCOMP, int Q1d>
653 inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
654   if (data.tidx < Q1d && data.tidy < Q1d) {
655     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
656       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
657       __syncthreads();
658       // X derivative
659       r_V[comp+0*NCOMP] = 0.0;
660       for (CeedInt i = 0; i < Q1d; ++i)
661         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
662       // Y derivative
663       r_V[comp+1*NCOMP] = 0.0;
664       for (CeedInt i = 0; i < Q1d; ++i)
665         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
666       // Z derivative
667       r_V[comp+2*NCOMP] = 0.0;
668       for (CeedInt i = 0; i < Q1d; ++i)
669         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
670       __syncthreads();
671     }
672   }
673 }
674 
675 //------------------------------------------------------------------------------
676 // 3D collocated derivatives transpose
677 //------------------------------------------------------------------------------
678 template <int NCOMP, int Q1d>
679 inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
680   if (data.tidx < Q1d && data.tidy < Q1d) {
681     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
682       // X derivative
683       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
684       __syncthreads();
685       for (CeedInt i = 0; i < Q1d; ++i)
686         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
687       __syncthreads();
688       // Y derivative
689       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
690       __syncthreads();
691       for (CeedInt i = 0; i < Q1d; ++i)
692         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
693       __syncthreads();
694       // Z derivative
695       for (CeedInt i = 0; i < Q1d; ++i)
696         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
697     }
698   }
699 }
700 
701 //------------------------------------------------------------------------------
702 // 1D quadrature weights
703 //------------------------------------------------------------------------------
704 template <int Q1d>
705 inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
706   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
707 }
708 
709 //------------------------------------------------------------------------------
710 // 2D quadrature weights
711 //------------------------------------------------------------------------------
712 template <int Q1d>
713 inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
714   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
715         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
716 }
717 
718 //------------------------------------------------------------------------------
719 // 3D quadrature weights
720 //------------------------------------------------------------------------------
721 template <int Q1d>
722 inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
723   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
724   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
725   for (CeedInt z = 0; z < Q1d; ++z)
726     w[z] = quad ? pw*qweight1d[z] : 0.0;
727 }
728 
729 );
730 //------------------------------------------------------------------------------
// Build single operator kernel
732 //------------------------------------------------------------------------------
733 CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {
734 
735   using std::ostringstream;
736   using std::string;
737   int ierr;
738   bool setupdone;
739   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
740   if (setupdone) return CEED_ERROR_SUCCESS;
741   Ceed ceed;
742   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
743   CeedOperator_Hip_gen *data;
744   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
745   CeedQFunction qf;
746   CeedQFunction_Hip_gen *qf_data;
747   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
748   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
749   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
750           numoutputfields, ncomp, dim = 0, lsize;
751   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
752   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
753   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
754   CeedChkBackend(ierr);
755   CeedOperatorField *opinputfields, *opoutputfields;
756   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
757   CeedChkBackend(ierr);
758   CeedQFunctionField *qfinputfields, *qfoutputfields;
759   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
760   CeedChkBackend(ierr);
761   CeedEvalMode emode;
762   CeedBasis basis;
763   CeedBasis_Hip_shared *basis_data;
764   CeedElemRestriction Erestrict;
765   CeedElemRestriction_Hip *restr_data;
766 
767   // Check for restriction only identity operator
768   bool is_identity_qf;
769   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
770   if (is_identity_qf) {
771     CeedEvalMode emodein, emodeout;
772     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
773     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
774     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
775       // LCOV_EXCL_START
776       return CeedError(ceed, CEED_ERROR_BACKEND,
777                        "Backend does not implement restriction only identity operators");
778     // LCOV_EXCL_STOP
779   }
780 
781   ostringstream code;
782   string devFunctions(deviceFunctions);
783   code << devFunctions;
784 
785   string qFunction(qf_data->qFunctionSource);
786   string qFunctionName(qf_data->qFunctionName);
787   string oper;
788   oper = "CeedKernel_Hip_gen_" + qFunctionName;
789 
790   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
791   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
792   code << "#define CeedPragmaSIMD\n";
793   code << "#define CEED_ERROR_SUCCESS 0\n\n";
794 
795   // Find dim and Q1d
796   bool useCollograd = true;
797   data->maxP1d = 0;
798   for (CeedInt i = 0; i < numinputfields; i++) {
799     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
800     if (basis != CEED_BASIS_COLLOCATED) {
801       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
802       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
803       CeedChkBackend(ierr);
804 
805       // Check for collocated gradient
806       useCollograd = useCollograd && basis_data->d_collograd1d;
807 
808       // Collect dim and Q1d
809       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
810       bool isTensor;
811       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
812       if (isTensor) {
813         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
814         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
815         if (P1d>data->maxP1d) data->maxP1d = P1d;
816       } else {
817         // LCOV_EXCL_START
818         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
819         // LCOV_EXCL_STOP
820         }
821     }
822   }
823   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
825   for (CeedInt i = 0; i < numoutputfields; i++) {
826     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
827 
828     if (basis != CEED_BASIS_COLLOCATED) {
829       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
830       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
831       CeedChkBackend(ierr);
832 
833       // Collect dim and Q1d
834       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
835       bool isTensor;
836       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
837       if (isTensor) {
838         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
839       } else {
840         // LCOV_EXCL_START
841         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
842         // LCOV_EXCL_STOP
843         }
844 
845       // Check for collocated gradient
846       useCollograd = useCollograd && basis_data->d_collograd1d;
847     }
848   }
849   data->dim = dim;
850   data->Q1d = Q1d;
851 
852   // Define CEED_Q_VLA
853   if (dim != 3 || useCollograd) {
854     code << "\n#define CEED_Q_VLA 1\n\n";
855   } else {
856     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
857   }
858 
859   code << qFunction;
860 
861   // Setup
862   code << "\n// -----------------------------------------------------------------------------\n";
863   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
864   for (CeedInt i = 0; i < numinputfields; i++) {
865     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
866     CeedChkBackend(ierr);
867     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
868       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
869     }
870   }
871 
872   for (CeedInt i = 0; i < numoutputfields; i++) {
873     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
874   }
875 
876   code << "  const CeedInt Dim = "<<dim<<";\n";
877   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
878 
879   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
880   code << "  BackendData data;\n";
881   code << "  data.tidx = threadIdx.x;\n";
882   code << "  data.tidy = threadIdx.y;\n";
883   code << "  data.tidz = threadIdx.z;\n";
884   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
885   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
886 
887   code << "\n  // -- Input field constants and basis data --\n";
888   //Initialize constants, and matrices B and G
889   for (CeedInt i = 0; i < numinputfields; i++) {
890     code << "  // ---- Input field "<<i<<" ----\n";
891     // Get elemsize, emode, ncomp
892     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
893     CeedChkBackend(ierr);
894     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
895     CeedChkBackend(ierr);
896     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
897     CeedChkBackend(ierr);
898     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
899     CeedChkBackend(ierr);
900 
901     // Set field constants
902     if (emode != CEED_EVAL_WEIGHT) {
903       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
904       if (basis != CEED_BASIS_COLLOCATED) {
905         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
906         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
907       } else {
908         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
909       }
910       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
911     }
912 
913     // Load basis data
914     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
915     switch (emode) {
916     case CEED_EVAL_NONE:
917       break;
918     case CEED_EVAL_INTERP:
919       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
920       data->B.in[i] = basis_data->d_interp1d;
921       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
922       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
923       break;
924     case CEED_EVAL_GRAD:
925       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
926       data->B.in[i] = basis_data->d_interp1d;
927       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
928       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
929       if (useCollograd) {
930         data->G.in[i] = basis_data->d_collograd1d;
931         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
932         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
933       } else {
934         data->G.in[i] = basis_data->d_grad1d;
935         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
936         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
937       }
938       break;
939     case CEED_EVAL_WEIGHT:
940       break; // No action
941     case CEED_EVAL_DIV:
942       break; // TODO: Not implemented
943     case CEED_EVAL_CURL:
944       break; // TODO: Not implemented
945     }
946   }
947 
948   code << "\n  // -- Output field constants and basis data --\n";
949   for (CeedInt i = 0; i < numoutputfields; i++) {
950     code << "  // ---- Output field "<<i<<" ----\n";
951     // Get elemsize, emode, ncomp
952     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
953     CeedChkBackend(ierr);
954     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
955     CeedChkBackend(ierr);
956     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
957     CeedChkBackend(ierr);
958     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
959     CeedChkBackend(ierr);
960 
961     // Set field constants
962     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
963     if (basis != CEED_BASIS_COLLOCATED) {
964       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
965       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
966     } else {
967       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
968     }
969     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
970 
971     // Load basis data
972     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
973     switch (emode) {
974     case CEED_EVAL_NONE:
975       break; // No action
976     case CEED_EVAL_INTERP:
977       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
978       data->B.out[i] = basis_data->d_interp1d;
979       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
980       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
981       break;
982     case CEED_EVAL_GRAD:
983       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
984       data->B.out[i] = basis_data->d_interp1d;
985       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
986       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
987       if (useCollograd) {
988         data->G.out[i] = basis_data->d_collograd1d;
989         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
990         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
991       } else {
992         data->G.out[i] = basis_data->d_grad1d;
993         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
994         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
995       }
996       break;
997     // LCOV_EXCL_START
998     case CEED_EVAL_WEIGHT: {
999       Ceed ceed;
1000       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1001       return CeedError(ceed, CEED_ERROR_BACKEND,
1002                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1003       break; // Should not occur
1004     }
1005     case CEED_EVAL_DIV:
1006       break; // TODO: Not implemented
1007     case CEED_EVAL_CURL:
1008       break; // TODO: Not implemented
1009       // LCOV_EXCL_STOP
1010     }
1011   }
1012   code << "\n  // -- Element loop --\n";
1013   code << "  __syncthreads();\n";
1014   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1015   // Input basis apply if needed
1016   // Generate the correct eval mode code for each input
1017   code << "    // -- Input field restrictions and basis actions --\n";
1018   for (CeedInt i = 0; i < numinputfields; i++) {
1019     code << "    // ---- Input field "<<i<<" ----\n";
1020     // Get elemsize, emode, ncomp
1021     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1022     CeedChkBackend(ierr);
1023     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1024     CeedChkBackend(ierr);
1025     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1026     CeedChkBackend(ierr);
1027     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1028     CeedChkBackend(ierr);
1029 
1030     // Restriction
1031     if (emode != CEED_EVAL_WEIGHT &&
1032         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1033       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1034 
1035       bool isStrided;
1036       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1037       if (!isStrided) {
1038         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1039         CeedChkBackend(ierr);
1040         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1041         CeedInt compstride;
1042         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1043         code << "    // CompStride: "<<compstride<<"\n";
1044         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1045         data->indices.in[i] = restr_data->d_ind;
1046         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1047       } else {
1048         bool backendstrides;
1049         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1050         CeedChkBackend(ierr);
1051         CeedInt nelem;
1052         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1053         CeedChkBackend(ierr);
1054         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1055         if (!backendstrides) {
1056           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1057           CeedChkBackend(ierr);
1058         }
1059         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1060         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1061       }
1062     }
1063 
1064     // Basis action
1065     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1066     switch (emode) {
1067     case CEED_EVAL_NONE:
1068       if (!useCollograd) {
1069         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1070       }
1071       break;
1072     case CEED_EVAL_INTERP:
1073       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1074       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1075       break;
1076     case CEED_EVAL_GRAD:
1077       if (useCollograd) {
1078         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1079         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1080       } else {
1081         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1082         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1083       }
1084       break;
1085     case CEED_EVAL_WEIGHT:
1086       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1087       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1088       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1089       data->W = basis_data->d_qweight1d;
1090       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1091       break; // No action
1092     case CEED_EVAL_DIV:
1093       break; // TODO: Not implemented
1094     case CEED_EVAL_CURL:
1095       break; // TODO: Not implemented
1096     }
1097   }
1098 
1099   // Q function
1100   code << "\n    // -- Output field setup --\n";
1101   for (CeedInt i = 0; i < numoutputfields; i++) {
1102       code << "\n    // ---- Output field "<<i<<" ----\n";
1103     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1104     CeedChkBackend(ierr);
1105     if (emode==CEED_EVAL_GRAD)
1106     {
1107       if (useCollograd) {
1108         //Accumulator for gradient slices
1109         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1110         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1111         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1112         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1113         code << "      }\n";
1114         code << "    }\n";
1115       } else {
1116         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1117       }
1118     }
1119     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1120     {
1121       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1122     }
1123   }
1124   // We treat quadrature points per slice in 3d to save registers
1125   if (useCollograd) {
1126     code << "\n    // Note: Collocated Gradient\n";
1127     code << "#pragma unroll\n";
1128     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1129     code << "      // -- Input fields --\n";
1130     for (CeedInt i = 0; i < numinputfields; i++) {
1131       code << "      // ---- Input field "<<i<<" ----\n";
1132       // Get elemsize, emode, ncomp
1133       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1134       CeedChkBackend(ierr);
1135       // Basis action
1136       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1137       switch (emode) {
1138       case CEED_EVAL_NONE:
1139         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1140 
1141         bool isStrided;
1142         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1143         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1144         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1145         if (!isStrided) {
1146           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1147           CeedChkBackend(ierr);
1148           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1149           CeedInt compstride;
1150           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1151           code << "      // CompStride: "<<compstride<<"\n";
1152           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1153           data->indices.in[i] = restr_data->d_ind;
1154           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1155         } else {
1156           bool backendstrides;
1157           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1158           CeedChkBackend(ierr);
1159           CeedInt nelem;
1160           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1161           CeedChkBackend(ierr);
1162           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1163           if (!backendstrides) {
1164             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1165             CeedChkBackend(ierr);
1166           }
1167           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1168           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1169         }
1170         break;
1171       case CEED_EVAL_INTERP:
1172         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1173         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1174         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1175         code << "      }\n";
1176         break;
1177       case CEED_EVAL_GRAD:
1178         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1179         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1180         break;
1181       case CEED_EVAL_WEIGHT:
1182         code << "      CeedScalar r_q"<<i<<"[1];\n";
1183         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1184         break; // No action
1185       case CEED_EVAL_DIV:
1186         break; // TODO: Not implemented
1187       case CEED_EVAL_CURL:
1188         break; // TODO: Not implemented
1189       }
1190     }
1191     code << "\n      // -- Output fields --\n";
1192     for (CeedInt i = 0; i < numoutputfields; i++) {
1193       code << "      // ---- Output field "<<i<<" ----\n";
1194       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1195       CeedChkBackend(ierr);
1196       // Basis action
1197       switch (emode) {
1198       case CEED_EVAL_NONE:
1199         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1200         break; // No action
1201       case CEED_EVAL_INTERP:
1202         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1203         break;
1204       case CEED_EVAL_GRAD:
1205         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1206         break;
1207       case CEED_EVAL_WEIGHT:
1208         break; // Should not occur
1209       case CEED_EVAL_DIV:
1210         break; // TODO: Not implemented
1211       case CEED_EVAL_CURL:
1212         break; // TODO: Not implemented
1213       }
1214     }
1215   } else {
1216     code << "\n      // Note: No Collocated Gradient\n";
1217     code << "      // -- Input fields --\n";
1218     for (CeedInt i = 0; i < numinputfields; i++) {
1219       code << "      // ---- Input field "<<i<<" ----\n";
1220       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1221     }
1222     code << "      // -- Output fields --\n";
1223     for (CeedInt i = 0; i < numoutputfields; i++) {
1224       code << "      // ---- Output field "<<i<<" ----\n";
1225       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1226     }
1227   }
1228   code << "\n      // -- QFunction Inputs and outputs --\n";
1229   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1230   for (CeedInt i = 0; i < numinputfields; i++) {
1231     code << "      // ---- Input field "<<i<<" ----\n";
1232     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1233   }
1234   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1235   for (CeedInt i = 0; i < numoutputfields; i++) {
1236     code << "      // ---- Output field "<<i<<" ----\n";
1237     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1238   }
1239   code << "\n      // -- Apply QFunction --\n";
1240   code << "      "<<qFunctionName<<"(ctx, ";
1241   if (dim != 3 || useCollograd) {
1242     code << "1";
1243   } else {
1244     code << "Q1d";
1245   }
1246   code << ", in, out);\n";
1247   if (useCollograd) {
1248     code << "\n      // Note: Collocated Gradient\n";
1249     code << "      // -- Output fields --\n";
1250     for (CeedInt i = 0; i < numoutputfields; i++) {
1251       code << "      // ---- Output field "<<i<<" ----\n";
1252       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1253       CeedChkBackend(ierr);
1254       // Basis action
1255       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1256       switch (emode) {
1257       case CEED_EVAL_NONE:
1258         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1259         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1260         code << "      }\n";
1261         break; // No action
1262       case CEED_EVAL_INTERP:
1263         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1264         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1265         code << "      }\n";
1266         break;
1267       case CEED_EVAL_GRAD:
1268         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1269         break;
1270       case CEED_EVAL_WEIGHT:
1271         break; // Should not occur
1272       case CEED_EVAL_DIV:
1273         break; // TODO: Not implemented
1274       case CEED_EVAL_CURL:
1275         break; // TODO: Not implemented
1276       }
1277     }
1278     code << "    }\n";
1279   }
1280 
1281   // Output basis apply if needed
1282   // Generate the correct eval mode code for each output
1283   code << "\n    // -- Output field basis action and restrictions --\n";
1284   for (CeedInt i = 0; i < numoutputfields; i++) {
1285     code << "    // ---- Output field "<<i<<" ----\n";
1286     // Get elemsize, emode, ncomp
1287     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1288     CeedChkBackend(ierr);
1289     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1290     CeedChkBackend(ierr);
1291     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1292     CeedChkBackend(ierr);
1293     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1294     CeedChkBackend(ierr);
1295     // Basis action
1296     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1297     switch (emode) {
1298     case CEED_EVAL_NONE:
1299       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1300       break; // No action
1301     case CEED_EVAL_INTERP:
1302       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1303       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1304       break;
1305     case CEED_EVAL_GRAD:
1306       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1307       if (useCollograd) {
1308         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1309       } else {
1310         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1311       }
1312       break;
1313     // LCOV_EXCL_START
1314     case CEED_EVAL_WEIGHT: {
1315       Ceed ceed;
1316       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1317       return CeedError(ceed, CEED_ERROR_BACKEND,
1318                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1319       break; // Should not occur
1320     }
1321     case CEED_EVAL_DIV:
1322       break; // TODO: Not implemented
1323     case CEED_EVAL_CURL:
1324       break; // TODO: Not implemented
1325       // LCOV_EXCL_STOP
1326     }
1327     // Restriction
1328       bool isStrided;
1329       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
1330     if (!isStrided) {
1331       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1332       CeedChkBackend(ierr);
1333       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1334       CeedInt compstride;
1335       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
1336       code << "    // CompStride: "<<compstride<<"\n";
1337       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
1338       data->indices.out[i] = restr_data->d_ind;
1339       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1340     } else {
1341       bool backendstrides;
1342       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1343       CeedChkBackend(ierr);
1344       CeedInt nelem;
1345       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1346       CeedChkBackend(ierr);
1347       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1348       if (!backendstrides) {
1349         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1350         CeedChkBackend(ierr);
1351       }
1352       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1353       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1354     }
1355   }
1356 
1357   code << "  }\n";
1358   code << "}\n";
1359   code << "// -----------------------------------------------------------------------------\n\n";
1360 
1361   // View kernel for debugging
1362   CeedDebug(code.str().c_str());
1363 
1364   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
1365                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1366   CeedChkBackend(ierr);
1367   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1368   CeedChkBackend(ierr);
1369 
1370   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1371   return CEED_ERROR_SUCCESS;
1372 }
1373 //------------------------------------------------------------------------------
1374