xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 60bb3a481c2d7b36fb54b77a50dd90844aeb9ae2)
1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3 // All Rights reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 #define CEED_DEBUG_COLOR 12
17 
18 #include "ceed-hip-gen.h"
19 #include <iostream>
20 #include <sstream>
21 #include "../hip-shared/ceed-hip-shared.h"
22 #include "../hip/ceed-hip-compile.h"
23 
24 static const char *deviceFunctions = QUOTE(
25 
26 //------------------------------------------------------------------------------
27 // Typedefs
28 //------------------------------------------------------------------------------
// Bundles of device pointers to the operator's input/output field arrays.
// Fixed capacity of 16 each — presumably the backend's maximum field count;
// confirm against the code generator that fills these structs.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
// Matching bundle of integer (offset/index) arrays for the same fields.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
31 
// Per-thread kernel state threaded through every device helper below.
typedef struct {
  CeedInt tidx;      // thread index in x (presumably threadIdx.x — set by the generated kernel)
  CeedInt tidy;      // thread index in y
  CeedInt tidz;      // thread index in z
  CeedInt tid;       // flattened thread id within the block (strided by blockDim.x*y*z in loadMatrix)
  CeedScalar* slice; // shared-memory scratch used by the tensor contractions (row stride T1d)
} BackendData;
39 
40 //------------------------------------------------------------------------------
41 // Load matrices for basis actions
42 //------------------------------------------------------------------------------
// Copy a P*Q basis matrix from global memory (d_B) into the destination
// buffer B, spreading the copy over every thread in the block via a
// block-stride loop starting at this thread's flattened id.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt stride = blockDim.x*blockDim.y*blockDim.z;
  for (CeedInt k = data.tid; k < P*Q; k += stride)
    B[k] = d_B[k];
}
48 
49 //------------------------------------------------------------------------------
50 // 1D
51 //------------------------------------------------------------------------------
52 
53 //------------------------------------------------------------------------------
54 // L-vector -> E-vector, offsets provided
55 //------------------------------------------------------------------------------
// Read the DoFs of one 1D element from the L-vector d_u into registers r_u,
// using an element restriction given by explicit offsets in `indices`.
// One thread per node; threads with tidx >= P1d leave r_u untouched.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return; // guard: one thread per node
  const CeedInt dof = indices[elem * P1d + data.tidx];
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[dof + c * COMPSTRIDE];
}
65 
66 //------------------------------------------------------------------------------
67 // L-vector -> E-vector, strided
68 //------------------------------------------------------------------------------
// Read the DoFs of one 1D element from a strided L-vector into registers.
// One thread per node; threads with tidx >= P1d leave r_u untouched.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return; // guard: one thread per node
  const CeedInt base = elem * STRIDES_ELEM + data.tidx * STRIDES_NODE;
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
78 
79 //------------------------------------------------------------------------------
80 // E-vector -> L-vector, offsets provided
81 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 1D element into the L-vector d_v,
// using explicit offsets. atomicAdd is required because several elements
// may share the same L-vector DoF.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return; // guard: one thread per node
  const CeedInt dof = indices[elem * P1d + data.tidx];
  for (CeedInt c = 0; c < NCOMP; ++c)
    atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[c]);
}
91 
92 //------------------------------------------------------------------------------
93 // E-vector -> L-vector, strided
94 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 1D element into a strided L-vector.
// Strided restrictions map each E-vector entry to a unique L-vector entry,
// so a plain += (no atomics) is sufficient.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return; // guard: one thread per node
  const CeedInt base = elem * STRIDES_ELEM + data.tidx * STRIDES_NODE;
  for (CeedInt c = 0; c < NCOMP; ++c)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
104 
105 //------------------------------------------------------------------------------
106 // 1D tensor contraction x
107 //------------------------------------------------------------------------------
// 1D x-contraction: for tidx < Q1d, *V = sum_i B[i + tidx*P1d] * U_i, where
// the U_i of the other threads are exchanged through shared memory
// (data.slice). Both barriers are unconditional so every thread in the block
// reaches them; slice entries at i >= P1d are written but never read.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U; // stage this thread's value for the block
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
118 
119 //------------------------------------------------------------------------------
120 // 1D transpose tensor contraction x
121 //------------------------------------------------------------------------------
// 1D transpose x-contraction: for tidx < P1d,
// *V = sum_i B[tidx + i*P1d] * U_i (B applied transposed).
// Barriers are unconditional so every thread in the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U; // stage this thread's value for the block
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
132 
133 //------------------------------------------------------------------------------
134 // 1D interpolate to quadrature points
135 //------------------------------------------------------------------------------
// 1D interpolation to quadrature points: apply the basis matrix c_B to each
// component of r_U independently.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
141 
142 //------------------------------------------------------------------------------
143 // 1D interpolate transpose
144 //------------------------------------------------------------------------------
// 1D transpose interpolation: apply the transposed basis matrix c_B to each
// component of r_U independently.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, &r_V[c]);
}
150 
151 //------------------------------------------------------------------------------
152 // 1D derivatives at quadrature points
153 //------------------------------------------------------------------------------
// 1D derivatives at quadrature points: apply the gradient matrix c_G to each
// component. c_B is unused here; it is kept for a signature uniform with the
// higher-dimensional grad variants.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
159 
160 //------------------------------------------------------------------------------
161 // 1D derivatives transpose
162 //------------------------------------------------------------------------------
// 1D transpose derivatives: apply the transposed gradient matrix c_G to each
// component. c_B is unused here; kept for a uniform signature.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt c = 0; c < NCOMP; ++c)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, &r_U[c], c_G, &r_V[c]);
}
168 
169 //------------------------------------------------------------------------------
170 // 2D
171 //------------------------------------------------------------------------------
172 
173 //------------------------------------------------------------------------------
174 // L-vector -> E-vector, offsets provided
175 //------------------------------------------------------------------------------
// Read the DoFs of one 2D element from the L-vector into registers, using an
// element restriction given by explicit offsets. One thread per node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per node
  const CeedInt node = data.tidy*P1d + data.tidx;
  const CeedInt dof = indices[elem * P1d*P1d + node];
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[dof + c * COMPSTRIDE];
}
185 
186 //------------------------------------------------------------------------------
187 // L-vector -> E-vector, strided
188 //------------------------------------------------------------------------------
// Read the DoFs of one 2D element from a strided L-vector into registers.
// One thread per node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per node
  const CeedInt node = data.tidy*P1d + data.tidx;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
198 
199 //------------------------------------------------------------------------------
200 // E-vector -> L-vector, offsets provided
201 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 2D element into the L-vector d_v,
// using explicit offsets. atomicAdd is required because neighboring elements
// share L-vector DoFs.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per node
  const CeedInt node = data.tidy*P1d + data.tidx;
  const CeedInt dof = indices[elem * P1d*P1d + node];
  for (CeedInt c = 0; c < NCOMP; ++c)
    atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[c]);
}
211 
212 //------------------------------------------------------------------------------
213 // E-vector -> L-vector, strided
214 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 2D element into a strided L-vector.
// Strided restrictions have no shared DoFs, so plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per node
  const CeedInt node = data.tidy*P1d + data.tidx;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;
  for (CeedInt c = 0; c < NCOMP; ++c)
    d_v[base + c * STRIDES_COMP] += r_v[c];
}
224 
225 //------------------------------------------------------------------------------
226 // 2D tensor contraction x
227 //------------------------------------------------------------------------------
// 2D x-contraction: for tidx < Q1d, tidy < P1d,
// *V = sum_i B[i + tidx*P1d] * U(i, tidy), with U exchanged across the block
// through data.slice (shared memory, row stride T1d — a compile-time tile
// size emitted by the code generator; presumably >= Q1d, confirm there).
// Barriers are unconditional so every thread in the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U; // stage this thread's value
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
238 
239 //------------------------------------------------------------------------------
240 // 2D tensor contract y
241 //------------------------------------------------------------------------------
// 2D y-contraction: for tidx < Q1d, tidy < Q1d,
// *V = sum_i B[i + tidy*P1d] * U(tidx, i), with U exchanged through the
// shared slice. Barriers are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U; // stage this thread's value
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
252 
253 //------------------------------------------------------------------------------
254 // 2D transpose tensor contract y
255 //------------------------------------------------------------------------------
// 2D transpose y-contraction: for tidx < Q1d, tidy < P1d,
// *V = sum_i B[tidy + i*P1d] * U(tidx, i) (B applied transposed).
// Barriers are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U; // stage this thread's value
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
266 
267 //------------------------------------------------------------------------------
268 // 2D transpose tensor contract x
269 //------------------------------------------------------------------------------
// 2D transpose x-contraction: for tidx < P1d, tidy < P1d,
// *V = sum_i B[tidx + i*P1d] * U(i, tidy) (B applied transposed).
// Barriers are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U; // stage this thread's value
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
280 
281 //------------------------------------------------------------------------------
282 // 2D transpose tensor contract and add x
283 //------------------------------------------------------------------------------
// 2D transpose x-contraction, accumulating variant: like ContractXTranspose2d
// but ADDS into *V instead of overwriting it (no zeroing) — used by
// gradTranspose2d to sum the two directional contributions into one output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U; // stage this thread's value
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads(); // protect slice before the next contraction reuses it
}
293 
294 //------------------------------------------------------------------------------
295 // 2D interpolate to quadrature points
296 //------------------------------------------------------------------------------
// 2D interpolation to quadrature points: contract each component with c_B in
// x then in y, using a one-value register buffer between the passes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractY2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
305 
306 //------------------------------------------------------------------------------
307 // 2D interpolate transpose
308 //------------------------------------------------------------------------------
// 2D transpose interpolation: contract each component with transposed c_B in
// y then in x, using a one-value register buffer between the passes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar tmp[1];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, &r_U[c], c_B, tmp);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, tmp, c_B, &r_V[c]);
  }
}
317 
318 //------------------------------------------------------------------------------
319 // 2D derivatives at quadrature points
320 //------------------------------------------------------------------------------
// 2D derivatives at quadrature points. For each component, r_V receives the
// x derivative at offset comp + 0*NCOMP (G in x, B in y) and the y derivative
// at offset comp + 1*NCOMP (B in x, G in y). r_t is a one-value register
// buffer reused between the two passes of each derivative.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}
331 
332 //------------------------------------------------------------------------------
333 // 2D derivatives transpose
334 //------------------------------------------------------------------------------
// 2D transpose derivatives. The x-derivative input (comp + 0*NCOMP) and the
// y-derivative input (comp + 1*NCOMP) are both contracted back to nodes and
// summed into r_V[comp]: the first pass overwrites via ContractXTranspose2d,
// the second accumulates via ContractXTransposeAdd2d.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
345 
346 //------------------------------------------------------------------------------
347 // 3D
348 //------------------------------------------------------------------------------
349 
350 //------------------------------------------------------------------------------
351 // L-vector -> E-vector, offsets provided
352 //------------------------------------------------------------------------------
// Read the DoFs of one 3D element from the L-vector into registers, using
// explicit offsets. Each (tidx, tidy) thread owns a full z column; r_u is
// laid out z-fastest per component: r_u[z + comp*P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per (x, y) column
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = (z*P1d + data.tidy)*P1d + data.tidx;
    const CeedInt dof = indices[elem * P1d*P1d*P1d + node];
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[z + c*P1d] = d_u[dof + c * COMPSTRIDE];
  }
}
363 
364 //------------------------------------------------------------------------------
365 // L-vector -> E-vector, strided
366 //------------------------------------------------------------------------------
// Read the DoFs of one 3D element from a strided L-vector into registers.
// Each (tidx, tidy) thread owns a full z column; r_u[z + comp*P1d] layout.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per (x, y) column
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = (z*P1d + data.tidy)*P1d + data.tidx;
    const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;
    for (CeedInt c = 0; c < NCOMP; ++c)
      r_u[z + c*P1d] = d_u[base + c * STRIDES_COMP];
  }
}
377 
378 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
380 //------------------------------------------------------------------------------
// Read one z-slice (quadrature level q) of an element's values into
// registers, with offsets provided by indices. One thread per quadrature
// point in the slice; threads outside the Q1d x Q1d tile leave r_u untouched.
// Fix: removed a stray empty statement (double semicolon) after the index load.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
390 
391 //------------------------------------------------------------------------------
392 // E-vector -> Q-vector, strided
393 //------------------------------------------------------------------------------
// Read one z-slice (quadrature level q) of an element's values from a
// strided E-vector into registers. One thread per quadrature point in the
// slice; threads outside the Q1d x Q1d tile leave r_u untouched.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= Q1d || data.tidy >= Q1d) return; // guard: one thread per point
  const CeedInt node = (q*Q1d + data.tidy)*Q1d + data.tidx;
  const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;
  for (CeedInt c = 0; c < NCOMP; ++c)
    r_u[c] = d_u[base + c * STRIDES_COMP];
}
403 
404 //------------------------------------------------------------------------------
405 // E-vector -> L-vector, offsets provided
406 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 3D element into the L-vector d_v,
// using explicit offsets. Each (tidx, tidy) thread writes its whole z column;
// atomicAdd is required because neighboring elements share L-vector DoFs.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per (x, y) column
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = (z*P1d + data.tidy)*P1d + data.tidx;
    const CeedInt dof = indices[elem * P1d*P1d*P1d + node];
    for (CeedInt c = 0; c < NCOMP; ++c)
      atomicAdd(&d_v[dof + c * COMPSTRIDE], r_v[z + c*P1d]);
  }
}
417 
418 //------------------------------------------------------------------------------
419 // E-vector -> L-vector, strided
420 //------------------------------------------------------------------------------
// Accumulate the registers r_v of one 3D element into a strided L-vector.
// Strided restrictions have no shared DoFs, so plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return; // guard: one thread per (x, y) column
  for (CeedInt z = 0; z < P1d; ++z) {
    const CeedInt node = (z*P1d + data.tidy)*P1d + data.tidx;
    const CeedInt base = elem * STRIDES_ELEM + node * STRIDES_NODE;
    for (CeedInt c = 0; c < NCOMP; ++c)
      d_v[base + c * STRIDES_COMP] += r_v[z + c*P1d];
  }
}
431 
432 //------------------------------------------------------------------------------
433 // 3D tensor contract x
434 //------------------------------------------------------------------------------
// 3D x-contraction, processed one z layer (register index k) at a time:
// V[k] = sum_i B[i + tidx*P1d] * U(i, tidy, k) for tidx < Q1d, tidy < P1d.
// This thread's row of B is first cached in registers (r_B); each layer is
// exchanged across the block via the shared slice. Barriers inside the k
// loop are unconditional — every thread in the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d]; // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
451 
452 //------------------------------------------------------------------------------
453 // 3D tensor contract y
454 //------------------------------------------------------------------------------
// 3D y-contraction, one z layer at a time:
// V[k] = sum_i B[i + tidy*P1d] * U(tidx, i, k) for tidx < Q1d, tidy < Q1d.
// This thread's row of B is cached in registers (r_B). Barriers inside the
// k loop are unconditional — every thread in the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d]; // cache this thread's B row

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
471 
472 //------------------------------------------------------------------------------
473 // 3D tensor contract z
474 //------------------------------------------------------------------------------
// 3D z-contraction: V[k] = sum_i B[i + k*P1d] * U[i]. Purely thread-local —
// the z values already live in this thread's registers, so no shared memory
// or barriers are needed. Threads outside the Q1d x Q1d tile still zero V[k].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}
484 
485 //------------------------------------------------------------------------------
486 // 3D transpose tensor contract z
487 //------------------------------------------------------------------------------
// 3D transpose z-contraction: V[k] = sum_i B[k + i*P1d] * U[i] (B applied
// transposed). Thread-local like ContractZ3d — no shared memory or barriers.
// Threads outside the Q1d x Q1d tile still zero V[k].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}
497 
498 //------------------------------------------------------------------------------
499 // 3D transpose tensor contract y
500 //------------------------------------------------------------------------------
// 3D transpose y-contraction, one z layer at a time:
// V[k] = sum_i B[tidy + i*P1d] * U(tidx, i, k) for tidx < Q1d, tidy < P1d.
// This thread's (transposed) row of B is cached in registers (r_B).
// Barriers inside the k loop are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B row (transposed access)

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
517 
518 //------------------------------------------------------------------------------
519 // 3D transpose tensor contract add y
520 //------------------------------------------------------------------------------
// 3D transpose y-contraction, accumulating variant: like ContractTransposeY3d
// but ADDS into V[k] instead of overwriting (no zeroing) — callers must have
// initialized V. Barriers are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d]; // cache this thread's B row (transposed access)

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
536 
537 //------------------------------------------------------------------------------
538 // 3D transpose tensor contract x
539 //------------------------------------------------------------------------------
// 3D transpose x-contraction, one z layer at a time:
// V[k] = sum_i B[tidx + i*P1d] * U(i, tidy, k) for tidx < P1d, tidy < P1d.
// This thread's (transposed) row of B is cached in registers (r_B).
// Barriers inside the k loop are unconditional — all threads reach them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B row (transposed access)

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
556 
557 //------------------------------------------------------------------------------
558 // 3D transpose tensor contract add x
559 //------------------------------------------------------------------------------
// 3D transpose x-contraction, accumulating variant: like ContractTransposeX3d
// but ADDS into V[k] instead of overwriting (no zeroing) — used by
// gradTranspose3d to sum the three directional contributions into one output.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d]; // cache this thread's B row (transposed access)

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k]; // stage layer k
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads(); // protect slice before staging the next layer
  }
}
575 
576 //------------------------------------------------------------------------------
577 // 3D interpolate to quadrature points
578 //------------------------------------------------------------------------------
// 3D interpolation to quadrature points: contract each component with c_B in
// x, y, then z, ping-ponging between two T1d-sized register buffers.
// r_U is laid out comp*P1d per component, r_V comp*Q1d per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar buf1[T1d];
  CeedScalar buf2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractX3d<NCOMP, P1d, Q1d>(data, &r_U[c*P1d], c_B, buf1);
    ContractY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*Q1d]);
  }
}
589 
590 //------------------------------------------------------------------------------
591 // 3D interpolate transpose
592 //------------------------------------------------------------------------------
// 3D transpose interpolation: contract each component with transposed c_B in
// z, y, then x, ping-ponging between two T1d-sized register buffers.
// r_U is laid out comp*Q1d per component, r_V comp*P1d per component.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar buf1[T1d];
  CeedScalar buf2[T1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, &r_U[c*Q1d], c_B, buf1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, buf1, c_B, buf2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, buf2, c_B, &r_V[c*P1d]);
  }
}
603 
604 //------------------------------------------------------------------------------
605 // 3D derivatives at quadrature points
606 //------------------------------------------------------------------------------
// 3D derivatives at quadrature points. For each component, r_V receives the
// x derivative at offset comp*Q1d + 0*NCOMP*Q1d (G in x, B in y/z), the y
// derivative at + 1*NCOMP*Q1d (G in y), and the z derivative at + 2*NCOMP*Q1d
// (G in z). r_t1/r_t2 are register buffers reused between the passes of each
// derivative, so the call order within an iteration must be preserved.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}
623 
624 //------------------------------------------------------------------------------
625 // 3D derivatives transpose
626 //------------------------------------------------------------------------------
// Apply the transpose of the 3D gradient: for each component, accumulate the
// transpose contractions of the three derivative slices back to nodes. The
// first direction initializes r_V; the remaining two accumulate into it.
//   r_U: input, NCOMP*Dim*Q1d registers per thread, derivative d of
//        component comp read from offset comp*Q1d + d*NCOMP*Q1d
//   c_B: 1D interpolation matrix (P1d x Q1d), shared memory
//   c_G: 1D gradient matrix (P1d x Q1d), shared memory
//   r_V: output at nodes, NCOMP*P1d registers per thread
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_scratchA[T1d];
  CeedScalar r_scratchB[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    CeedScalar *r_comp_out = r_V + comp*P1d;
    // Loop over derivative direction; apply c_G transpose on axis d only
    for (CeedInt d = 0; d < 3; d++) {
      ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + d*NCOMP*Q1d, (d == 2) ? c_G : c_B, r_scratchA);
      ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_scratchA, (d == 1) ? c_G : c_B, r_scratchB);
      if (d == 0) {
        // First direction writes the output
        ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_scratchB, c_G, r_comp_out);
      } else {
        // Subsequent directions accumulate
        ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_scratchB, c_B, r_comp_out);
      }
    }
  }
}
643 
644 //------------------------------------------------------------------------------
645 // 3D collocated derivatives computation
646 //------------------------------------------------------------------------------
// Compute collocated 3D derivatives at quadrature slice q.
//   q:   index of the z-slice of quadrature points being processed
//   r_U: input at quadrature points, NCOMP*Q1d registers per thread
//   c_G: collocated 1D gradient matrix (Q1d x Q1d), shared memory
//   r_V: output, NCOMP*Dim registers per thread; derivative d of component
//        comp stored at r_V[comp + d*NCOMP]
// X and Y derivatives are contracted through data.slice (shared memory);
// the Z derivative uses only per-thread registers.
//
// Fix: __syncthreads() must not sit inside the divergent
// "tidx < Q1d && tidy < Q1d" branch — thread blocks are T1d x T1d with
// T1d = max(P1d, Q1d), so when P1d > Q1d some threads would never reach the
// barrier (undefined behavior / hang). Barriers are hoisted so every thread
// in the block executes them; only the shared-memory accesses stay guarded.
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's slice-q values in shared memory
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative (register-only, no shared memory involved)
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    }
    // Barrier before the next component overwrites data.slice
    __syncthreads();
  }
}
669 
670 //------------------------------------------------------------------------------
671 // 3D collocated derivatives transpose
672 //------------------------------------------------------------------------------
// Apply the transpose of the collocated 3D gradient for quadrature slice q,
// accumulating into r_V.
//   q:   index of the z-slice of quadrature points being processed
//   r_U: input, NCOMP*Dim registers per thread; derivative d of component
//        comp read from r_U[comp + d*NCOMP]
//   c_G: collocated 1D gradient matrix (Q1d x Q1d), shared memory
//   r_V: accumulator at quadrature points, NCOMP*Q1d registers per thread
// X and Y contractions go through data.slice (shared memory); the Z
// contraction is a register-only partial accumulation across slices.
//
// Fix: __syncthreads() must not sit inside the divergent
// "tidx < Q1d && tidy < Q1d" branch — thread blocks are T1d x T1d with
// T1d = max(P1d, Q1d), so when P1d > Q1d some threads would never reach the
// barrier (undefined behavior / hang). Barriers are hoisted so every thread
// in the block executes them; only the shared-memory accesses stay guarded.
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const bool active = data.tidx < Q1d && data.tidy < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    if (active)
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative (register-only; no barrier needed)
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}
695 
696 //------------------------------------------------------------------------------
697 // 1D quadrature weights
698 //------------------------------------------------------------------------------
// Load the 1D quadrature weight for this thread's point into *w.
// Threads outside the quadrature range (tidx >= Q1d) receive 0.0.
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  CeedScalar weight = 0.0;
  if (data.tidx < Q1d)
    weight = qweight1d[data.tidx];
  *w = weight;
}
703 
704 //------------------------------------------------------------------------------
705 // 2D quadrature weights
706 //------------------------------------------------------------------------------
// Load the 2D quadrature weight (tensor product of the 1D weights in x and y)
// for this thread's point into *w. Threads outside the quadrature range
// receive 0.0.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  CeedScalar weight = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    weight = qweight1d[data.tidx]*qweight1d[data.tidy];
  *w = weight;
}
712 
713 //------------------------------------------------------------------------------
714 // 3D quadrature weights
715 //------------------------------------------------------------------------------
// Load the 3D quadrature weights for this thread's column of points into
// w[0..Q1d-1] (tensor product of 1D weights in x, y, and each z level).
// Threads outside the quadrature range receive all zeros.
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedScalar wxy = qweight1d[data.tidx]*qweight1d[data.tidy];
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = wxy*qweight1d[z];
  } else {
    for (CeedInt z = 0; z < Q1d; ++z)
      w[z] = 0.0;
  }
}
723 
724 );
725 //------------------------------------------------------------------------------
726 // Build singe operator kernel
727 //------------------------------------------------------------------------------
728 CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {
729 
730   using std::ostringstream;
731   using std::string;
732   int ierr;
733   bool setupdone;
734   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
735   if (setupdone) return 0;
736   Ceed ceed;
737   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
738   CeedOperator_Hip_gen *data;
739   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
740   CeedQFunction qf;
741   CeedQFunction_Hip_gen *qf_data;
742   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
743   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
744   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
745           numoutputfields, ncomp, dim = 0, lsize;
746   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
747   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
748   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
749   CeedChk(ierr);
750   CeedOperatorField *opinputfields, *opoutputfields;
751   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
752   CeedChk(ierr);
753   CeedQFunctionField *qfinputfields, *qfoutputfields;
754   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
755   CeedChk(ierr);
756   CeedEvalMode emode;
757   CeedBasis basis;
758   CeedBasis_Hip_shared *basis_data;
759   CeedElemRestriction Erestrict;
760   CeedElemRestriction_Hip *restr_data;
761 
762   ostringstream code;
763   string devFunctions(deviceFunctions);
764   code << devFunctions;
765 
766   string qFunction(qf_data->qFunctionSource);
767   string qFunctionName(qf_data->qFunctionName);
768   string oper;
769   oper = "CeedKernel_Hip_gen_" + qFunctionName;
770 
771   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
772   code << "\n#define CeedPragmaSIMD\n";
773 
774   // Find dim and Q1d
775   bool useCollograd = true;
776   data->maxP1d = 0;
777   for (CeedInt i = 0; i < numinputfields; i++) {
778     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
779     if (basis != CEED_BASIS_COLLOCATED) {
780       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
781       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
782       CeedChk(ierr);
783 
784       // Check for collocated gradient
785       useCollograd = useCollograd && basis_data->d_collograd1d;
786 
787       // Collect dim and Q1d
788       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
789       bool isTensor;
790       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
791       if (isTensor) {
792         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
793         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
794         if (P1d>data->maxP1d) data->maxP1d = P1d;
795       } else {
796         // LCOV_EXCL_START
797         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
798         // LCOV_EXCL_STOP
799         }
800     }
801   }
802   // Check output bases for Q1d, dim as well
  //   The only input basis might be CEED_BASIS_COLLOCATED
804   for (CeedInt i = 0; i < numoutputfields; i++) {
805     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
806 
807     if (basis != CEED_BASIS_COLLOCATED) {
808       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
809       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
810       CeedChk(ierr);
811 
812       // Collect dim and Q1d
813       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
814       bool isTensor;
815       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
816       if (isTensor) {
817         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
818       } else {
819         // LCOV_EXCL_START
820         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
821         // LCOV_EXCL_STOP
822         }
823 
824       // Check for collocated gradient
825       useCollograd = useCollograd && basis_data->d_collograd1d;
826     }
827   }
828   data->dim = dim;
829   data->Q1d = Q1d;
830 
831   // Define CEED_Q_VLA
832   if (dim != 3 || useCollograd) {
833     code << "\n#define CEED_Q_VLA 1\n\n";
834   } else {
835     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
836   }
837 
838   code << qFunction;
839 
840   // Setup
841   code << "\n// -----------------------------------------------------------------------------\n";
842   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
843   for (CeedInt i = 0; i < numinputfields; i++) {
844     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
845     CeedChk(ierr);
846     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
847       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
848     }
849   }
850 
851   for (CeedInt i = 0; i < numoutputfields; i++) {
852     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
853   }
854 
855   code << "  const CeedInt Dim = "<<dim<<";\n";
856   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
857 
858   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
859   code << "  BackendData data;\n";
860   code << "  data.tidx = threadIdx.x;\n";
861   code << "  data.tidy = threadIdx.y;\n";
862   code << "  data.tidz = threadIdx.z;\n";
863   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
864   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
865 
866   code << "\n  // -- Input field constants and basis data --\n";
867   //Initialize constants, and matrices B and G
868   for (CeedInt i = 0; i < numinputfields; i++) {
869     code << "  // ---- Input field "<<i<<" ----\n";
870     // Get elemsize, emode, ncomp
871     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
872     CeedChk(ierr);
873     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
874     CeedChk(ierr);
875     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
876     CeedChk(ierr);
877     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
878     CeedChk(ierr);
879 
880     // Set field constants
881     if (emode != CEED_EVAL_WEIGHT) {
882       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
883       if (basis != CEED_BASIS_COLLOCATED) {
884         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
885         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
886       } else {
887         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
888       }
889       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
890     }
891 
892     // Load basis data
893     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
894     switch (emode) {
895     case CEED_EVAL_NONE:
896       break;
897     case CEED_EVAL_INTERP:
898       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
899       data->B.in[i] = basis_data->d_interp1d;
900       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
901       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
902       break;
903     case CEED_EVAL_GRAD:
904       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
905       data->B.in[i] = basis_data->d_interp1d;
906       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
907       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
908       if (useCollograd) {
909         data->G.in[i] = basis_data->d_collograd1d;
910         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
911         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
912       } else {
913         data->G.in[i] = basis_data->d_grad1d;
914         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
915         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
916       }
917       break;
918     case CEED_EVAL_WEIGHT:
919       break; // No action
920     case CEED_EVAL_DIV:
921       break; // TODO: Not implemented
922     case CEED_EVAL_CURL:
923       break; // TODO: Not implemented
924     }
925   }
926 
927   code << "\n  // -- Output field constants and basis data --\n";
928   for (CeedInt i = 0; i < numoutputfields; i++) {
929     code << "  // ---- Output field "<<i<<" ----\n";
930     // Get elemsize, emode, ncomp
931     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
932     CeedChk(ierr);
933     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
934     CeedChk(ierr);
935     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
936     CeedChk(ierr);
937     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
938     CeedChk(ierr);
939 
940     // Set field constants
941     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
942     if (basis != CEED_BASIS_COLLOCATED) {
943       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
944       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
945     } else {
946       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
947     }
948     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
949 
950     // Load basis data
951     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
952     switch (emode) {
953     case CEED_EVAL_NONE:
954       break; // No action
955     case CEED_EVAL_INTERP:
956       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
957       data->B.out[i] = basis_data->d_interp1d;
958       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
959       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
960       break;
961     case CEED_EVAL_GRAD:
962       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
963       data->B.out[i] = basis_data->d_interp1d;
964       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
965       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
966       if (useCollograd) {
967         data->G.out[i] = basis_data->d_collograd1d;
968         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
969         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
970       } else {
971         data->G.out[i] = basis_data->d_grad1d;
972         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
973         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
974       }
975       break;
976     // LCOV_EXCL_START
977     case CEED_EVAL_WEIGHT: {
978       Ceed ceed;
979       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
980       return CeedError(ceed, 1,
981                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
982       break; // Should not occur
983     }
984     case CEED_EVAL_DIV:
985       break; // TODO: Not implemented
986     case CEED_EVAL_CURL:
987       break; // TODO: Not implemented
988       // LCOV_EXCL_STOP
989     }
990   }
991   code << "\n  // -- Element loop --\n";
992   code << "  __syncthreads();\n";
993   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
994   // Input basis apply if needed
995   // Generate the correct eval mode code for each input
996   code << "    // -- Input field restrictions and basis actions --\n";
997   for (CeedInt i = 0; i < numinputfields; i++) {
998     code << "    // ---- Input field "<<i<<" ----\n";
999     // Get elemsize, emode, ncomp
1000     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1001     CeedChk(ierr);
1002     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1003     CeedChk(ierr);
1004     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1005     CeedChk(ierr);
1006     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1007     CeedChk(ierr);
1008 
1009     // Restriction
1010     if (emode != CEED_EVAL_WEIGHT &&
1011         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1012       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1013 
1014       bool isStrided;
1015       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1016       if (!isStrided) {
1017         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1018         CeedChk(ierr);
1019         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1020         CeedInt compstride;
1021         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1022         code << "    // CompStride: "<<compstride<<"\n";
1023         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1024         data->indices.in[i] = restr_data->d_ind;
1025         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1026       } else {
1027         bool backendstrides;
1028         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1029         CeedChk(ierr);
1030         CeedInt nelem;
1031         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1032         CeedChk(ierr);
1033         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1034         if (!backendstrides) {
1035           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1036           CeedChk(ierr);
1037         }
1038         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1039         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1040       }
1041     }
1042 
1043     // Basis action
1044     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1045     switch (emode) {
1046     case CEED_EVAL_NONE:
1047       if (!useCollograd) {
1048         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1049       }
1050       break;
1051     case CEED_EVAL_INTERP:
1052       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1053       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1054       break;
1055     case CEED_EVAL_GRAD:
1056       if (useCollograd) {
1057         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1058         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1059       } else {
1060         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1061         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1062       }
1063       break;
1064     case CEED_EVAL_WEIGHT:
1065       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1066       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1067       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1068       data->W = basis_data->d_qweight1d;
1069       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1070       break; // No action
1071     case CEED_EVAL_DIV:
1072       break; // TODO: Not implemented
1073     case CEED_EVAL_CURL:
1074       break; // TODO: Not implemented
1075     }
1076   }
1077 
1078   // Q function
1079   code << "\n    // -- Output field setup --\n";
1080   for (CeedInt i = 0; i < numoutputfields; i++) {
1081       code << "\n    // ---- Output field "<<i<<" ----\n";
1082     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1083     CeedChk(ierr);
1084     if (emode==CEED_EVAL_GRAD)
1085     {
1086       if (useCollograd) {
1087         //Accumulator for gradient slices
1088         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1089         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1090         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1091         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1092         code << "      }\n";
1093         code << "    }\n";
1094       } else {
1095         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1096       }
1097     }
1098     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1099     {
1100       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1101     }
1102   }
1103   // We treat quadrature points per slice in 3d to save registers
1104   if (useCollograd) {
1105     code << "\n    // Note: Collocated Gradient\n";
1106     code << "#pragma unroll\n";
1107     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1108     code << "      // -- Input fields --\n";
1109     for (CeedInt i = 0; i < numinputfields; i++) {
1110       code << "      // ---- Input field "<<i<<" ----\n";
1111       // Get elemsize, emode, ncomp
1112       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1113       CeedChk(ierr);
1114       // Basis action
1115       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1116       switch (emode) {
1117       case CEED_EVAL_NONE:
1118         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1119 
1120         bool isStrided;
1121         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1122         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1123         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1124         if (!isStrided) {
1125           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1126           CeedChk(ierr);
1127           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1128           CeedInt compstride;
1129           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1130           code << "      // CompStride: "<<compstride<<"\n";
1131           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1132           data->indices.in[i] = restr_data->d_ind;
1133           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1134         } else {
1135           bool backendstrides;
1136           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1137           CeedChk(ierr);
1138           CeedInt nelem;
1139           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1140           CeedChk(ierr);
1141           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1142           if (!backendstrides) {
1143             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1144             CeedChk(ierr);
1145           }
1146           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1147           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1148         }
1149         break;
1150       case CEED_EVAL_INTERP:
1151         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1152         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1153         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1154         code << "      }\n";
1155         break;
1156       case CEED_EVAL_GRAD:
1157         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1158         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1159         break;
1160       case CEED_EVAL_WEIGHT:
1161         code << "      CeedScalar r_q"<<i<<"[1];\n";
1162         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1163         break; // No action
1164       case CEED_EVAL_DIV:
1165         break; // TODO: Not implemented
1166       case CEED_EVAL_CURL:
1167         break; // TODO: Not implemented
1168       }
1169     }
1170     code << "\n      // -- Output fields --\n";
1171     for (CeedInt i = 0; i < numoutputfields; i++) {
1172       code << "      // ---- Output field "<<i<<" ----\n";
1173       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1174       CeedChk(ierr);
1175       // Basis action
1176       switch (emode) {
1177       case CEED_EVAL_NONE:
1178         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1179         break; // No action
1180       case CEED_EVAL_INTERP:
1181         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1182         break;
1183       case CEED_EVAL_GRAD:
1184         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1185         break;
1186       case CEED_EVAL_WEIGHT:
1187         break; // Should not occur
1188       case CEED_EVAL_DIV:
1189         break; // TODO: Not implemented
1190       case CEED_EVAL_CURL:
1191         break; // TODO: Not implemented
1192       }
1193     }
1194   } else {
1195     code << "\n      // Note: No Collocated Gradient\n";
1196     code << "      // -- Input fields --\n";
1197     for (CeedInt i = 0; i < numinputfields; i++) {
1198       code << "      // ---- Input field "<<i<<" ----\n";
1199       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1200     }
1201     code << "      // -- Output fields --\n";
1202     for (CeedInt i = 0; i < numoutputfields; i++) {
1203       code << "      // ---- Output field "<<i<<" ----\n";
1204       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1205     }
1206   }
1207   code << "\n      // -- QFunction Inputs and outputs --\n";
1208   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1209   for (CeedInt i = 0; i < numinputfields; i++) {
1210     code << "      // ---- Input field "<<i<<" ----\n";
1211     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1212   }
1213   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1214   for (CeedInt i = 0; i < numoutputfields; i++) {
1215     code << "      // ---- Output field "<<i<<" ----\n";
1216     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1217   }
1218   code << "\n      // -- Apply QFunction --\n";
1219   code << "      "<<qFunctionName<<"(ctx, ";
1220   if (dim != 3 || useCollograd) {
1221     code << "1";
1222   } else {
1223     code << "Q1d";
1224   }
1225   code << ", in, out);\n";
1226   if (useCollograd) {
1227     code << "\n      // Note: Collocated Gradient\n";
1228     code << "      // -- Output fields --\n";
1229     for (CeedInt i = 0; i < numoutputfields; i++) {
1230       code << "      // ---- Output field "<<i<<" ----\n";
1231       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1232       CeedChk(ierr);
1233       // Basis action
1234       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1235       switch (emode) {
1236       case CEED_EVAL_NONE:
1237         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1238         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1239         code << "      }\n";
1240         break; // No action
1241       case CEED_EVAL_INTERP:
1242         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1243         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1244         code << "      }\n";
1245         break;
1246       case CEED_EVAL_GRAD:
1247         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1248         break;
1249       case CEED_EVAL_WEIGHT:
1250         break; // Should not occur
1251       case CEED_EVAL_DIV:
1252         break; // TODO: Not implemented
1253       case CEED_EVAL_CURL:
1254         break; // TODO: Not implemented
1255       }
1256     }
1257     code << "    }\n";
1258   }
1259 
1260   // Output basis apply if needed
1261   // Generate the correct eval mode code for each output
1262   code << "\n    // -- Output field basis action and restrictions --\n";
1263   for (CeedInt i = 0; i < numoutputfields; i++) {
1264     code << "    // ---- Output field "<<i<<" ----\n";
1265     // Get elemsize, emode, ncomp
1266     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1267     CeedChk(ierr);
1268     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1269     CeedChk(ierr);
1270     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1271     CeedChk(ierr);
1272     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1273     CeedChk(ierr);
1274     // Basis action
1275     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1276     switch (emode) {
1277     case CEED_EVAL_NONE:
1278       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1279       break; // No action
1280     case CEED_EVAL_INTERP:
1281       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1282       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1283       break;
1284     case CEED_EVAL_GRAD:
1285       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1286       if (useCollograd) {
1287         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1288       } else {
1289         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1290       }
1291       break;
1292     // LCOV_EXCL_START
1293     case CEED_EVAL_WEIGHT: {
1294       Ceed ceed;
1295       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1296       return CeedError(ceed, 1,
1297                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1298       break; // Should not occur
1299     }
1300     case CEED_EVAL_DIV:
1301       break; // TODO: Not implemented
1302     case CEED_EVAL_CURL:
1303       break; // TODO: Not implemented
1304       // LCOV_EXCL_STOP
1305     }
1306     // Restriction
1307       bool isStrided;
1308       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1309     if (!isStrided) {
1310       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1311       CeedChk(ierr);
1312       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1313       CeedInt compstride;
1314       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1315       code << "    // CompStride: "<<compstride<<"\n";
1316       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1317       data->indices.out[i] = restr_data->d_ind;
1318       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1319     } else {
1320       bool backendstrides;
1321       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1322       CeedChk(ierr);
1323       CeedInt nelem;
1324       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1325       CeedChk(ierr);
1326       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1327       if (!backendstrides) {
1328         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1329         CeedChk(ierr);
1330       }
1331       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1332       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1333     }
1334   }
1335 
1336   code << "  }\n";
1337   code << "}\n";
1338   code << "// -----------------------------------------------------------------------------\n\n";
1339 
1340   // View kernel for debugging
1341   CeedDebug(code.str().c_str());
1342 
1343   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
1344                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1345   CeedChk(ierr);
1346   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1347   CeedChk(ierr);
1348 
1349   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1350   return 0;
1351 }
1352 //------------------------------------------------------------------------------
1353