xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 7d8d0e25636a94a27ff75b3dec09737e24cdb0fe)
1*7d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
2*7d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
3*7d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details.
4*7d8d0e25Snbeams //
5*7d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software
6*7d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral
7*7d8d0e25Snbeams // element discretizations for exascale applications. For more information and
8*7d8d0e25Snbeams // source code availability see http://github.com/ceed.
9*7d8d0e25Snbeams //
10*7d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11*7d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office
12*7d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for
13*7d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including
14*7d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early
15*7d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative.
16*7d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
17*7d8d0e25Snbeams 
18*7d8d0e25Snbeams #include "ceed-hip-gen.h"
19*7d8d0e25Snbeams #include <iostream>
20*7d8d0e25Snbeams #include <sstream>
21*7d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
22*7d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
23*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// JIT source string: CAS-loop fallback implementing atomicAdd on doubles for
// targets without a native double-precision atomicAdd. Stringified by QUOTE
// and prepended to the runtime-compiled operator kernel source.
//------------------------------------------------------------------------------
static const char *atomicAdd = QUOTE(
//------------------------------------------------------------------------------
// Atomic add, for older CUDA
//------------------------------------------------------------------------------
__device__ double atomicAdd(double *address, double val) {
  // Reinterpret the double as a 64-bit integer so atomicCAS can operate on it.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old);
  // Return the previous value, matching the hardware atomicAdd contract.
  return __longlong_as_double(old);
}
);
43*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// JIT source string: device helpers for the runtime-compiled operator kernel
// (L/E-vector gather-scatter and tensor contractions for 1D/2D/3D bases).
// T1d, used below as the leading dimension of the 'slice' buffer, is defined
// at JIT-compile time -- NOTE(review): confirm where T1d is injected.
//------------------------------------------------------------------------------
static const char *deviceFunctions = QUOTE(

//------------------------------------------------------------------------------
// Typedefs
//------------------------------------------------------------------------------
// Kernel argument bundles: pointers for up to 16 input and 16 output fields.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;

// Per-thread state handed to every helper: thread indices within the block,
// the flattened thread id, and 'slice' -- a scratch buffer with leading
// dimension T1d, always bracketed by __syncthreads() (presumably shared
// memory set up by the generated kernel -- NOTE(review): confirm at launch).
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;
59*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperative copy of a P*Q basis matrix from d_B into B: each thread starts
// at its flattened id and strides by the total number of threads in the block.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}
68*7d8d0e25Snbeams 
69*7d8d0e25Snbeams //------------------------------------------------------------------------------
70*7d8d0e25Snbeams // 1D
71*7d8d0e25Snbeams //------------------------------------------------------------------------------
72*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one element's dofs: thread tidx handles node tidx of element 'elem',
// mapping through the 'indices' offset table, and reads all NCOMP components
// (component comp lives at ind + COMPSTRIDE*comp) into registers r_u.
// 'nnodes' is unused here (kept for a uniform helper signature).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
85*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one element's dofs using compile-time strides instead of an offset
// table: node tidx of element 'elem' reads all NCOMP components into r_u.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
98*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add one element's dofs back to the L-vector through the offset
// table. atomicAdd is required: elements sharing a node map to the same
// L-vector entry and may write concurrently. 'nnodes' is unused here.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
111*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Scatter-add with compile-time strides. Plain += (no atomics) -- presumably
// strided layouts give each L-vector entry a single writer, so no collisions
// occur -- NOTE(review): confirm strided restrictions never alias nodes.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
124*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// V[q] = sum_p B[p + q*P1d] * U[p]: each thread stages its value in the
// block-wide 'slice' buffer, then threads with tidx < Q1d reduce over the
// P1d nodal values. Both __syncthreads() are needed: stores must be visible
// before the reduction, and the reduction must finish before 'slice' reuse.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
138*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Transpose of ContractX1d: V[p] = sum_q B[p + q*P1d] * U[q]. U is staged in
// 'slice'; threads with tidx < P1d reduce over the Q1d quadrature values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}
152*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply the interpolation matrix c_B to each of the NCOMP components,
// taking P1d nodal values (r_U) to Q1d quadrature values (r_V).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
161*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Apply the transpose of c_B per component: Q1d quadrature values back to
// P1d nodal values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp=0; comp<NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}
170*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Apply the gradient matrix c_G per component. c_B is unused in 1D; it is
// kept for signature parity with the 2D/3D grad helpers.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
179*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Apply the transpose of the gradient matrix c_G per component. c_B is
// unused in 1D (signature parity with 2D/3D).
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
188*7d8d0e25Snbeams 
189*7d8d0e25Snbeams //------------------------------------------------------------------------------
190*7d8d0e25Snbeams // 2D
191*7d8d0e25Snbeams //------------------------------------------------------------------------------
192*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 2D gather: thread (tidx, tidy) handles node tidx + tidy*P1d of element
// 'elem' (P1d*P1d nodes total), mapping through the 'indices' offset table
// and reading all NCOMP components into r_u. 'nnodes' is unused here.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}
205*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 2D gather with compile-time strides instead of an offset table: thread
// (tidx, tidy) reads all NCOMP components of its node into r_u.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
218*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 2D scatter-add through the offset table. atomicAdd is required because
// elements sharing a node target the same L-vector entry concurrently.
// 'nnodes' is unused here (uniform helper signature).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}
231*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 2D scatter-add with compile-time strides. Plain += (no atomics) --
// presumably each L-vector entry has a single writer in strided layouts --
// NOTE(review): confirm no node aliasing across elements.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}
244*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract along x: stage U in 'slice' (leading dimension T1d), then each
// thread with (tidx < Q1d, tidy < P1d) computes
// V = sum_p B[p + tidx*P1d] * slice[p + tidy*T1d].
// Syncs bracket the reduction because 'slice' is reused between calls.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
258*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract along y: stage U in 'slice', then each thread with
// (tidx < Q1d, tidy < Q1d) computes
// V = sum_p B[p + tidy*P1d] * slice[tidx + p*T1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < Q1d)
    for (CeedInt i = 0; i < P1d; ++i)
      *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
272*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction along y (Q1d -> P1d in y, x still at Q1d points):
// threads with (tidx < Q1d, tidy < P1d) compute
// V = sum_q B[tidy + q*P1d] * slice[tidx + q*T1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < Q1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
  __syncthreads();
}
286*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction along x (back to P1d x P1d nodes): threads with
// (tidx < P1d, tidy < P1d) compute
// V = sum_q B[tidx + q*P1d] * slice[q + tidy*T1d].
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
300*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V instead of zeroing it;
// used by gradTranspose2d to sum the x- and y-derivative contributions.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*T1d] = *U;
  __syncthreads();
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
  __syncthreads();
}
313*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Per component: contract x then y with the interpolation matrix c_B, using
// the single-register temporary r_t between the two passes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
325*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Per component: apply the transpose passes in reverse order (y then x) to
// map Q1d*Q1d quadrature values back to P1d*P1d nodal values.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
337*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Per component, compute both partial derivatives at quadrature points:
//   d/dx: gradient matrix c_G in x, interpolation c_B in y -> r_V[comp + 0*NCOMP]
//   d/dy: interpolation c_B in x, gradient c_G in y        -> r_V[comp + 1*NCOMP]
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}
351*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Per component, apply the transpose of grad2d: the x-derivative part
// (r_U[comp + 0*NCOMP]) goes through transpose c_B in y then transpose c_G in
// x; the y-derivative part (r_U[comp + 1*NCOMP]) goes through transpose c_G
// in y then transpose c_B in x, ACCUMULATED into r_V via the Add variant.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}
365*7d8d0e25Snbeams 
366*7d8d0e25Snbeams //------------------------------------------------------------------------------
367*7d8d0e25Snbeams // 3D
368*7d8d0e25Snbeams //------------------------------------------------------------------------------
369*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// 3D gather: thread (tidx, tidy) loops over the z-slabs, mapping each node
// through the 'indices' offset table. Register layout of r_u is
// r_u[z + comp*P1d]. 'nnodes' is unused here (uniform helper signature).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}
383*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// 3D gather with compile-time strides: thread (tidx, tidy) loops over z-slabs
// and reads all NCOMP components, register layout r_u[z + comp*P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}
397*7d8d0e25Snbeams 
398*7d8d0e25Snbeams //------------------------------------------------------------------------------
399*7d8d0e25Snbeams // E-vector -> Q-vector, offests provided
400*7d8d0e25Snbeams //------------------------------------------------------------------------------
401*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d>
402*7d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
403*7d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
404*7d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
405*7d8d0e25Snbeams     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
406*7d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
407*7d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
408*7d8d0e25Snbeams   }
409*7d8d0e25Snbeams }
410*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one z-layer (quadrature layer 'q') of an element's Q-vector values
// using compile-time strides: thread (tidx, tidy) loads all NCOMP components
// of its quadrature point into r_u.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}
423*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// 3D scatter-add through the offset table: thread (tidx, tidy) loops over
// z-slabs, adding r_v[z + comp*P1d] into the L-vector. atomicAdd is required
// because elements sharing a node write the same entry concurrently.
// 'nnodes' is unused here (uniform helper signature).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}
437*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// 3D scatter-add with compile-time strides. Plain += (no atomics) --
// presumably strided layouts give each L-vector entry a single writer --
// NOTE(review): confirm no node aliasing across elements.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}
451*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract along x for all P1d z-slabs. Each thread first caches its row of
// B in registers (r_B), then per slab k stages U[k] in 'slice' and threads
// with (tidx < Q1d, tidy < P1d) reduce over the P1d x-entries. Syncs bracket
// each slab because 'slice' is reused between iterations.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < P1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}
471*7d8d0e25Snbeams 
472*7d8d0e25Snbeams //------------------------------------------------------------------------------
473*7d8d0e25Snbeams // 3D tensor contract y
474*7d8d0e25Snbeams //------------------------------------------------------------------------------
475*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
476*7d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
477*7d8d0e25Snbeams   CeedScalar r_B[P1d];
478*7d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
479*7d8d0e25Snbeams     r_B[i] = B[i + data.tidy*P1d];
480*7d8d0e25Snbeams 
481*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
482*7d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
483*7d8d0e25Snbeams     __syncthreads();
484*7d8d0e25Snbeams     V[k] = 0.0;
485*7d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
486*7d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
487*7d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
488*7d8d0e25Snbeams     __syncthreads();
489*7d8d0e25Snbeams   }
490*7d8d0e25Snbeams }
491*7d8d0e25Snbeams 
492*7d8d0e25Snbeams //------------------------------------------------------------------------------
493*7d8d0e25Snbeams // 3D tensor contract z
494*7d8d0e25Snbeams //------------------------------------------------------------------------------
495*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
496*7d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
497*7d8d0e25Snbeams   for (CeedInt k = 0; k < Q1d; ++k) {
498*7d8d0e25Snbeams     V[k] = 0.0;
499*7d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
500*7d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
501*7d8d0e25Snbeams         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
502*7d8d0e25Snbeams   }
503*7d8d0e25Snbeams }
504*7d8d0e25Snbeams 
505*7d8d0e25Snbeams //------------------------------------------------------------------------------
506*7d8d0e25Snbeams // 3D transpose tensor contract z
507*7d8d0e25Snbeams //------------------------------------------------------------------------------
508*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
509*7d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
510*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
511*7d8d0e25Snbeams     V[k] = 0.0;
512*7d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
513*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
514*7d8d0e25Snbeams         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
515*7d8d0e25Snbeams   }
516*7d8d0e25Snbeams }
517*7d8d0e25Snbeams 
518*7d8d0e25Snbeams //------------------------------------------------------------------------------
519*7d8d0e25Snbeams // 3D transpose tensor contract y
520*7d8d0e25Snbeams //------------------------------------------------------------------------------
521*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
522*7d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
523*7d8d0e25Snbeams   CeedScalar r_B[Q1d];
524*7d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
525*7d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
526*7d8d0e25Snbeams 
527*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
528*7d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
529*7d8d0e25Snbeams     __syncthreads();
530*7d8d0e25Snbeams     V[k] = 0.0;
531*7d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
532*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
533*7d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
534*7d8d0e25Snbeams     __syncthreads();
535*7d8d0e25Snbeams   }
536*7d8d0e25Snbeams }
537*7d8d0e25Snbeams 
538*7d8d0e25Snbeams //------------------------------------------------------------------------------
539*7d8d0e25Snbeams // 3D transpose tensor contract add y
540*7d8d0e25Snbeams //------------------------------------------------------------------------------
541*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
542*7d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
543*7d8d0e25Snbeams   CeedScalar r_B[Q1d];
544*7d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
545*7d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
546*7d8d0e25Snbeams 
547*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
548*7d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
549*7d8d0e25Snbeams     __syncthreads();
550*7d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
551*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
552*7d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
553*7d8d0e25Snbeams     __syncthreads();
554*7d8d0e25Snbeams   }
555*7d8d0e25Snbeams }
556*7d8d0e25Snbeams 
557*7d8d0e25Snbeams //------------------------------------------------------------------------------
558*7d8d0e25Snbeams // 3D transpose tensor contract x
559*7d8d0e25Snbeams //------------------------------------------------------------------------------
560*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
561*7d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
562*7d8d0e25Snbeams   CeedScalar r_B[Q1d];
563*7d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
564*7d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
565*7d8d0e25Snbeams 
566*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
567*7d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
568*7d8d0e25Snbeams     __syncthreads();
569*7d8d0e25Snbeams     V[k] = 0.0;
570*7d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
571*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
572*7d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
573*7d8d0e25Snbeams     __syncthreads();
574*7d8d0e25Snbeams   }
575*7d8d0e25Snbeams }
576*7d8d0e25Snbeams 
577*7d8d0e25Snbeams //------------------------------------------------------------------------------
578*7d8d0e25Snbeams // 3D transpose tensor contract add x
579*7d8d0e25Snbeams //------------------------------------------------------------------------------
580*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
581*7d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
582*7d8d0e25Snbeams   CeedScalar r_B[Q1d];
583*7d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
584*7d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
585*7d8d0e25Snbeams 
586*7d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
587*7d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
588*7d8d0e25Snbeams     __syncthreads();
589*7d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
590*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
591*7d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
592*7d8d0e25Snbeams     __syncthreads();
593*7d8d0e25Snbeams   }
594*7d8d0e25Snbeams }
595*7d8d0e25Snbeams 
596*7d8d0e25Snbeams //------------------------------------------------------------------------------
597*7d8d0e25Snbeams // 3D interpolate to quadrature points
598*7d8d0e25Snbeams //------------------------------------------------------------------------------
599*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
600*7d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
601*7d8d0e25Snbeams   CeedScalar r_t1[T1d];
602*7d8d0e25Snbeams   CeedScalar r_t2[T1d];
603*7d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
604*7d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
605*7d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
606*7d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
607*7d8d0e25Snbeams   }
608*7d8d0e25Snbeams }
609*7d8d0e25Snbeams 
610*7d8d0e25Snbeams //------------------------------------------------------------------------------
611*7d8d0e25Snbeams // 3D interpolate transpose
612*7d8d0e25Snbeams //------------------------------------------------------------------------------
613*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
614*7d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
615*7d8d0e25Snbeams   CeedScalar r_t1[T1d];
616*7d8d0e25Snbeams   CeedScalar r_t2[T1d];
617*7d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
618*7d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
619*7d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
620*7d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
621*7d8d0e25Snbeams   }
622*7d8d0e25Snbeams }
623*7d8d0e25Snbeams 
624*7d8d0e25Snbeams //------------------------------------------------------------------------------
625*7d8d0e25Snbeams // 3D derivatives at quadrature points
626*7d8d0e25Snbeams //------------------------------------------------------------------------------
627*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
628*7d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
629*7d8d0e25Snbeams   CeedScalar r_t1[T1d];
630*7d8d0e25Snbeams   CeedScalar r_t2[T1d];
631*7d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
632*7d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
633*7d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
634*7d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
635*7d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
636*7d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
637*7d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
638*7d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
639*7d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
640*7d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
641*7d8d0e25Snbeams   }
642*7d8d0e25Snbeams }
643*7d8d0e25Snbeams 
644*7d8d0e25Snbeams //------------------------------------------------------------------------------
645*7d8d0e25Snbeams // 3D derivatives transpose
646*7d8d0e25Snbeams //------------------------------------------------------------------------------
647*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
648*7d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
649*7d8d0e25Snbeams   CeedScalar r_t1[T1d];
650*7d8d0e25Snbeams   CeedScalar r_t2[T1d];
651*7d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
652*7d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
653*7d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
654*7d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
655*7d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
656*7d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
657*7d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
658*7d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
659*7d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
660*7d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
661*7d8d0e25Snbeams   }
662*7d8d0e25Snbeams }
663*7d8d0e25Snbeams 
664*7d8d0e25Snbeams //------------------------------------------------------------------------------
665*7d8d0e25Snbeams // 3D collocated derivatives computation
666*7d8d0e25Snbeams //------------------------------------------------------------------------------
667*7d8d0e25Snbeams template <int NCOMP, int Q1d>
668*7d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
669*7d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
670*7d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
671*7d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
672*7d8d0e25Snbeams       __syncthreads();
673*7d8d0e25Snbeams       // X derivative
674*7d8d0e25Snbeams       r_V[comp+0*NCOMP] = 0.0;
675*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
676*7d8d0e25Snbeams         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
677*7d8d0e25Snbeams       // Y derivative
678*7d8d0e25Snbeams       r_V[comp+1*NCOMP] = 0.0;
679*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
680*7d8d0e25Snbeams         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
681*7d8d0e25Snbeams       // Z derivative
682*7d8d0e25Snbeams       r_V[comp+2*NCOMP] = 0.0;
683*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
684*7d8d0e25Snbeams         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
685*7d8d0e25Snbeams       __syncthreads();
686*7d8d0e25Snbeams     }
687*7d8d0e25Snbeams   }
688*7d8d0e25Snbeams }
689*7d8d0e25Snbeams 
690*7d8d0e25Snbeams //------------------------------------------------------------------------------
691*7d8d0e25Snbeams // 3D collocated derivatives transpose
692*7d8d0e25Snbeams //------------------------------------------------------------------------------
693*7d8d0e25Snbeams template <int NCOMP, int Q1d>
694*7d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
695*7d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
696*7d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
697*7d8d0e25Snbeams       // X derivative
698*7d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
699*7d8d0e25Snbeams       __syncthreads();
700*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
701*7d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
702*7d8d0e25Snbeams       __syncthreads();
703*7d8d0e25Snbeams       // Y derivative
704*7d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
705*7d8d0e25Snbeams       __syncthreads();
706*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
707*7d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
708*7d8d0e25Snbeams       __syncthreads();
709*7d8d0e25Snbeams       // Z derivative
710*7d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
711*7d8d0e25Snbeams         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
712*7d8d0e25Snbeams     }
713*7d8d0e25Snbeams   }
714*7d8d0e25Snbeams }
715*7d8d0e25Snbeams 
716*7d8d0e25Snbeams //------------------------------------------------------------------------------
717*7d8d0e25Snbeams // 1D quadrature weights
718*7d8d0e25Snbeams //------------------------------------------------------------------------------
719*7d8d0e25Snbeams template <int Q1d>
720*7d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
721*7d8d0e25Snbeams   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
722*7d8d0e25Snbeams }
723*7d8d0e25Snbeams 
724*7d8d0e25Snbeams //------------------------------------------------------------------------------
725*7d8d0e25Snbeams // 2D quadrature weights
726*7d8d0e25Snbeams //------------------------------------------------------------------------------
727*7d8d0e25Snbeams template <int Q1d>
728*7d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
729*7d8d0e25Snbeams   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
730*7d8d0e25Snbeams         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
731*7d8d0e25Snbeams }
732*7d8d0e25Snbeams 
733*7d8d0e25Snbeams //------------------------------------------------------------------------------
734*7d8d0e25Snbeams // 3D quadrature weights
735*7d8d0e25Snbeams //------------------------------------------------------------------------------
736*7d8d0e25Snbeams template <int Q1d>
737*7d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
738*7d8d0e25Snbeams   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
739*7d8d0e25Snbeams   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
740*7d8d0e25Snbeams   for (CeedInt z = 0; z < Q1d; ++z)
741*7d8d0e25Snbeams     w[z] = quad ? pw*qweight1d[z] : 0.0;
742*7d8d0e25Snbeams }
743*7d8d0e25Snbeams 
744*7d8d0e25Snbeams );
745*7d8d0e25Snbeams //------------------------------------------------------------------------------
746*7d8d0e25Snbeams // Build single operator kernel
747*7d8d0e25Snbeams //------------------------------------------------------------------------------
748*7d8d0e25Snbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
749*7d8d0e25Snbeams 
750*7d8d0e25Snbeams   using std::ostringstream;
751*7d8d0e25Snbeams   using std::string;
752*7d8d0e25Snbeams   int ierr;
753*7d8d0e25Snbeams   bool setupdone;
754*7d8d0e25Snbeams   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
755*7d8d0e25Snbeams   if (setupdone) return 0;
756*7d8d0e25Snbeams   Ceed ceed;
757*7d8d0e25Snbeams   ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
758*7d8d0e25Snbeams   CeedOperator_Hip_gen *data;
759*7d8d0e25Snbeams   ierr = CeedOperatorGetData(op, &data); CeedChk(ierr);
760*7d8d0e25Snbeams   CeedQFunction qf;
761*7d8d0e25Snbeams   CeedQFunction_Hip_gen *qf_data;
762*7d8d0e25Snbeams   ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
763*7d8d0e25Snbeams   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr);
764*7d8d0e25Snbeams   CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
765*7d8d0e25Snbeams           numoutputfields, ncomp, dim = 0, lsize;
766*7d8d0e25Snbeams   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
767*7d8d0e25Snbeams   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
768*7d8d0e25Snbeams   ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
769*7d8d0e25Snbeams   CeedChk(ierr);
770*7d8d0e25Snbeams   CeedOperatorField *opinputfields, *opoutputfields;
771*7d8d0e25Snbeams   ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
772*7d8d0e25Snbeams   CeedChk(ierr);
773*7d8d0e25Snbeams   CeedQFunctionField *qfinputfields, *qfoutputfields;
774*7d8d0e25Snbeams   ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
775*7d8d0e25Snbeams   CeedChk(ierr);
776*7d8d0e25Snbeams   CeedEvalMode emode;
777*7d8d0e25Snbeams   CeedBasis basis;
778*7d8d0e25Snbeams   CeedBasis_Hip_shared *basis_data;
779*7d8d0e25Snbeams   CeedElemRestriction Erestrict;
780*7d8d0e25Snbeams   CeedElemRestriction_Hip *restr_data;
781*7d8d0e25Snbeams 
782*7d8d0e25Snbeams   ostringstream code;
783*7d8d0e25Snbeams   string devFunctions(deviceFunctions);
784*7d8d0e25Snbeams 
785*7d8d0e25Snbeams   // Add atomicAdd function for old NVidia architectures
786*7d8d0e25Snbeams   struct hipDeviceProp_t prop;
787*7d8d0e25Snbeams   Ceed_Hip *ceed_data;
788*7d8d0e25Snbeams   ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr);
789*7d8d0e25Snbeams   ierr = hipGetDeviceProperties(&prop, ceed_data->deviceId);
790*7d8d0e25Snbeams   if (prop.major<6){
791*7d8d0e25Snbeams     code << atomicAdd;
792*7d8d0e25Snbeams   }
793*7d8d0e25Snbeams 
794*7d8d0e25Snbeams   code << devFunctions;
795*7d8d0e25Snbeams 
796*7d8d0e25Snbeams   string qFunction(qf_data->qFunctionSource);
797*7d8d0e25Snbeams   string qFunctionName(qf_data->qFunctionName);
798*7d8d0e25Snbeams   string oper;
799*7d8d0e25Snbeams   oper = "CeedKernel_Hip_gen_" + qFunctionName;
800*7d8d0e25Snbeams 
801*7d8d0e25Snbeams   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
802*7d8d0e25Snbeams   code << "\n#define CeedPragmaSIMD\n";
803*7d8d0e25Snbeams 
804*7d8d0e25Snbeams   // Find dim and Q1d
805*7d8d0e25Snbeams   bool useCollograd = true;
806*7d8d0e25Snbeams   data->maxP1d = 0;
807*7d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
808*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
809*7d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
810*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
811*7d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
812*7d8d0e25Snbeams       CeedChk(ierr);
813*7d8d0e25Snbeams 
814*7d8d0e25Snbeams       // Check for collocated gradient
815*7d8d0e25Snbeams       useCollograd = useCollograd && basis_data->d_collograd1d;
816*7d8d0e25Snbeams 
817*7d8d0e25Snbeams       // Collect dim and Q1d
818*7d8d0e25Snbeams       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
819*7d8d0e25Snbeams       bool isTensor;
820*7d8d0e25Snbeams       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
821*7d8d0e25Snbeams       if (isTensor) {
822*7d8d0e25Snbeams         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
823*7d8d0e25Snbeams         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
824*7d8d0e25Snbeams         if (P1d>data->maxP1d) data->maxP1d = P1d;
825*7d8d0e25Snbeams       } else {
826*7d8d0e25Snbeams         // LCOV_EXCL_START
827*7d8d0e25Snbeams         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
828*7d8d0e25Snbeams         // LCOV_EXCL_STOP
829*7d8d0e25Snbeams         }
830*7d8d0e25Snbeams     }
831*7d8d0e25Snbeams   }
832*7d8d0e25Snbeams   // Check output bases for Q1d, dim as well
833*7d8d0e25Snbeams //   The only input basis might be CEED_BASIS_COLLOCATED
834*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
835*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
836*7d8d0e25Snbeams 
837*7d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
838*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
839*7d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
840*7d8d0e25Snbeams       CeedChk(ierr);
841*7d8d0e25Snbeams 
842*7d8d0e25Snbeams       // Collect dim and Q1d
843*7d8d0e25Snbeams       ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
844*7d8d0e25Snbeams       bool isTensor;
845*7d8d0e25Snbeams       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr);
846*7d8d0e25Snbeams       if (isTensor) {
847*7d8d0e25Snbeams         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
848*7d8d0e25Snbeams       } else {
849*7d8d0e25Snbeams         // LCOV_EXCL_START
850*7d8d0e25Snbeams         return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
851*7d8d0e25Snbeams         // LCOV_EXCL_STOP
852*7d8d0e25Snbeams         }
853*7d8d0e25Snbeams 
854*7d8d0e25Snbeams       // Check for collocated gradient
855*7d8d0e25Snbeams       useCollograd = useCollograd && basis_data->d_collograd1d;
856*7d8d0e25Snbeams     }
857*7d8d0e25Snbeams   }
858*7d8d0e25Snbeams   data->dim = dim;
859*7d8d0e25Snbeams   data->Q1d = Q1d;
860*7d8d0e25Snbeams 
861*7d8d0e25Snbeams   // Define CEED_Q_VLA
862*7d8d0e25Snbeams   if (dim != 3 || useCollograd) {
863*7d8d0e25Snbeams     code << "\n#define CEED_Q_VLA 1\n\n";
864*7d8d0e25Snbeams   } else {
865*7d8d0e25Snbeams     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
866*7d8d0e25Snbeams   }
867*7d8d0e25Snbeams 
868*7d8d0e25Snbeams   code << qFunction;
869*7d8d0e25Snbeams 
870*7d8d0e25Snbeams   // Setup
871*7d8d0e25Snbeams   code << "\n// -----------------------------------------------------------------------------\n";
872*7d8d0e25Snbeams   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
873*7d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
874*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
875*7d8d0e25Snbeams     CeedChk(ierr);
876*7d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
877*7d8d0e25Snbeams       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
878*7d8d0e25Snbeams     }
879*7d8d0e25Snbeams   }
880*7d8d0e25Snbeams 
881*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
882*7d8d0e25Snbeams     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
883*7d8d0e25Snbeams   }
884*7d8d0e25Snbeams 
885*7d8d0e25Snbeams   code << "  const CeedInt Dim = "<<dim<<";\n";
886*7d8d0e25Snbeams   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
887*7d8d0e25Snbeams 
888*7d8d0e25Snbeams   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
889*7d8d0e25Snbeams   code << "  BackendData data;\n";
890*7d8d0e25Snbeams   code << "  data.tidx = threadIdx.x;\n";
891*7d8d0e25Snbeams   code << "  data.tidy = threadIdx.y;\n";
892*7d8d0e25Snbeams   code << "  data.tidz = threadIdx.z;\n";
893*7d8d0e25Snbeams   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
894*7d8d0e25Snbeams   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
895*7d8d0e25Snbeams 
896*7d8d0e25Snbeams   code << "\n  // -- Input field constants and basis data --\n";
897*7d8d0e25Snbeams   //Initialize constants, and matrices B and G
898*7d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
899*7d8d0e25Snbeams     code << "  // ---- Input field "<<i<<" ----\n";
900*7d8d0e25Snbeams     // Get elemsize, emode, ncomp
901*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
902*7d8d0e25Snbeams     CeedChk(ierr);
903*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
904*7d8d0e25Snbeams     CeedChk(ierr);
905*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
906*7d8d0e25Snbeams     CeedChk(ierr);
907*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
908*7d8d0e25Snbeams     CeedChk(ierr);
909*7d8d0e25Snbeams 
910*7d8d0e25Snbeams     // Set field constants
911*7d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) {
912*7d8d0e25Snbeams       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
913*7d8d0e25Snbeams       if (basis != CEED_BASIS_COLLOCATED) {
914*7d8d0e25Snbeams         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
915*7d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
916*7d8d0e25Snbeams       } else {
917*7d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
918*7d8d0e25Snbeams       }
919*7d8d0e25Snbeams       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
920*7d8d0e25Snbeams     }
921*7d8d0e25Snbeams 
922*7d8d0e25Snbeams     // Load basis data
923*7d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
924*7d8d0e25Snbeams     switch (emode) {
925*7d8d0e25Snbeams     case CEED_EVAL_NONE:
926*7d8d0e25Snbeams       break;
927*7d8d0e25Snbeams     case CEED_EVAL_INTERP:
928*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
929*7d8d0e25Snbeams       data->B.in[i] = basis_data->d_interp1d;
930*7d8d0e25Snbeams       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
931*7d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
932*7d8d0e25Snbeams       break;
933*7d8d0e25Snbeams     case CEED_EVAL_GRAD:
934*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
935*7d8d0e25Snbeams       data->B.in[i] = basis_data->d_interp1d;
936*7d8d0e25Snbeams       code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
937*7d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
938*7d8d0e25Snbeams       if (useCollograd) {
939*7d8d0e25Snbeams         data->G.in[i] = basis_data->d_collograd1d;
940*7d8d0e25Snbeams         code << "  __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
941*7d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
942*7d8d0e25Snbeams       } else {
943*7d8d0e25Snbeams         data->G.in[i] = basis_data->d_grad1d;
944*7d8d0e25Snbeams         code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
945*7d8d0e25Snbeams         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
946*7d8d0e25Snbeams       }
947*7d8d0e25Snbeams       break;
948*7d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
949*7d8d0e25Snbeams       break; // No action
950*7d8d0e25Snbeams     case CEED_EVAL_DIV:
951*7d8d0e25Snbeams       break; // TODO: Not implemented
952*7d8d0e25Snbeams     case CEED_EVAL_CURL:
953*7d8d0e25Snbeams       break; // TODO: Not implemented
954*7d8d0e25Snbeams     }
955*7d8d0e25Snbeams   }
956*7d8d0e25Snbeams 
957*7d8d0e25Snbeams   code << "\n  // -- Output field constants and basis data --\n";
958*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
959*7d8d0e25Snbeams     code << "  // ---- Output field "<<i<<" ----\n";
960*7d8d0e25Snbeams     // Get elemsize, emode, ncomp
961*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
962*7d8d0e25Snbeams     CeedChk(ierr);
963*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
964*7d8d0e25Snbeams     CeedChk(ierr);
965*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
966*7d8d0e25Snbeams     CeedChk(ierr);
967*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
968*7d8d0e25Snbeams     CeedChk(ierr);
969*7d8d0e25Snbeams 
970*7d8d0e25Snbeams     // Set field constants
971*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
972*7d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
973*7d8d0e25Snbeams       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
974*7d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
975*7d8d0e25Snbeams     } else {
976*7d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
977*7d8d0e25Snbeams     }
978*7d8d0e25Snbeams     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
979*7d8d0e25Snbeams 
980*7d8d0e25Snbeams     // Load basis data
981*7d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
982*7d8d0e25Snbeams     switch (emode) {
983*7d8d0e25Snbeams     case CEED_EVAL_NONE:
984*7d8d0e25Snbeams       break; // No action
985*7d8d0e25Snbeams     case CEED_EVAL_INTERP:
986*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
987*7d8d0e25Snbeams       data->B.out[i] = basis_data->d_interp1d;
988*7d8d0e25Snbeams       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
989*7d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
990*7d8d0e25Snbeams       break;
991*7d8d0e25Snbeams     case CEED_EVAL_GRAD:
992*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
993*7d8d0e25Snbeams       data->B.out[i] = basis_data->d_interp1d;
994*7d8d0e25Snbeams       code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
995*7d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
996*7d8d0e25Snbeams       if (useCollograd) {
997*7d8d0e25Snbeams         data->G.out[i] = basis_data->d_collograd1d;
998*7d8d0e25Snbeams         code << "  __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
999*7d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1000*7d8d0e25Snbeams       } else {
1001*7d8d0e25Snbeams         data->G.out[i] = basis_data->d_grad1d;
1002*7d8d0e25Snbeams         code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
1003*7d8d0e25Snbeams         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
1004*7d8d0e25Snbeams       }
1005*7d8d0e25Snbeams       break;
1006*7d8d0e25Snbeams     // LCOV_EXCL_START
1007*7d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
1008*7d8d0e25Snbeams       Ceed ceed;
1009*7d8d0e25Snbeams       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1010*7d8d0e25Snbeams       return CeedError(ceed, 1,
1011*7d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1012*7d8d0e25Snbeams       break; // Should not occur
1013*7d8d0e25Snbeams     }
1014*7d8d0e25Snbeams     case CEED_EVAL_DIV:
1015*7d8d0e25Snbeams       break; // TODO: Not implemented
1016*7d8d0e25Snbeams     case CEED_EVAL_CURL:
1017*7d8d0e25Snbeams       break; // TODO: Not implemented
1018*7d8d0e25Snbeams       // LCOV_EXCL_STOP
1019*7d8d0e25Snbeams     }
1020*7d8d0e25Snbeams   }
1021*7d8d0e25Snbeams   code << "\n  // -- Element loop --\n";
1022*7d8d0e25Snbeams   code << "  __syncthreads();\n";
1023*7d8d0e25Snbeams   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
1024*7d8d0e25Snbeams   // Input basis apply if needed
1025*7d8d0e25Snbeams   // Generate the correct eval mode code for each input
1026*7d8d0e25Snbeams   code << "    // -- Input field restrictions and basis actions --\n";
1027*7d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
1028*7d8d0e25Snbeams     code << "    // ---- Input field "<<i<<" ----\n";
1029*7d8d0e25Snbeams     // Get elemsize, emode, ncomp
1030*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1031*7d8d0e25Snbeams     CeedChk(ierr);
1032*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1033*7d8d0e25Snbeams     CeedChk(ierr);
1034*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1035*7d8d0e25Snbeams     CeedChk(ierr);
1036*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1037*7d8d0e25Snbeams     CeedChk(ierr);
1038*7d8d0e25Snbeams 
1039*7d8d0e25Snbeams     // Restriction
1040*7d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT &&
1041*7d8d0e25Snbeams         !((emode == CEED_EVAL_NONE) && useCollograd)) {
1042*7d8d0e25Snbeams       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
1043*7d8d0e25Snbeams 
1044*7d8d0e25Snbeams       bool isStrided;
1045*7d8d0e25Snbeams       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1046*7d8d0e25Snbeams       if (!isStrided) {
1047*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1048*7d8d0e25Snbeams         CeedChk(ierr);
1049*7d8d0e25Snbeams         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1050*7d8d0e25Snbeams         CeedInt compstride;
1051*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1052*7d8d0e25Snbeams         code << "    // CompStride: "<<compstride<<"\n";
1053*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1054*7d8d0e25Snbeams         data->indices.in[i] = restr_data->d_ind;
1055*7d8d0e25Snbeams         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
1056*7d8d0e25Snbeams       } else {
1057*7d8d0e25Snbeams         bool backendstrides;
1058*7d8d0e25Snbeams         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1059*7d8d0e25Snbeams         CeedChk(ierr);
1060*7d8d0e25Snbeams         CeedInt nelem;
1061*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1062*7d8d0e25Snbeams         CeedChk(ierr);
1063*7d8d0e25Snbeams         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1064*7d8d0e25Snbeams         if (!backendstrides) {
1065*7d8d0e25Snbeams           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1066*7d8d0e25Snbeams           CeedChk(ierr);
1067*7d8d0e25Snbeams         }
1068*7d8d0e25Snbeams         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1069*7d8d0e25Snbeams         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
1070*7d8d0e25Snbeams       }
1071*7d8d0e25Snbeams     }
1072*7d8d0e25Snbeams 
1073*7d8d0e25Snbeams     // Basis action
1074*7d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1075*7d8d0e25Snbeams     switch (emode) {
1076*7d8d0e25Snbeams     case CEED_EVAL_NONE:
1077*7d8d0e25Snbeams       if (!useCollograd) {
1078*7d8d0e25Snbeams         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
1079*7d8d0e25Snbeams       }
1080*7d8d0e25Snbeams       break;
1081*7d8d0e25Snbeams     case CEED_EVAL_INTERP:
1082*7d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1083*7d8d0e25Snbeams       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1084*7d8d0e25Snbeams       break;
1085*7d8d0e25Snbeams     case CEED_EVAL_GRAD:
1086*7d8d0e25Snbeams       if (useCollograd) {
1087*7d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
1088*7d8d0e25Snbeams         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
1089*7d8d0e25Snbeams       } else {
1090*7d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
1091*7d8d0e25Snbeams         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
1092*7d8d0e25Snbeams       }
1093*7d8d0e25Snbeams       break;
1094*7d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
1095*7d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1096*7d8d0e25Snbeams       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
1097*7d8d0e25Snbeams       ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr);
1098*7d8d0e25Snbeams       data->W = basis_data->d_qweight1d;
1099*7d8d0e25Snbeams       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
1100*7d8d0e25Snbeams       break; // No action
1101*7d8d0e25Snbeams     case CEED_EVAL_DIV:
1102*7d8d0e25Snbeams       break; // TODO: Not implemented
1103*7d8d0e25Snbeams     case CEED_EVAL_CURL:
1104*7d8d0e25Snbeams       break; // TODO: Not implemented
1105*7d8d0e25Snbeams     }
1106*7d8d0e25Snbeams   }
1107*7d8d0e25Snbeams 
1108*7d8d0e25Snbeams   // Q function
1109*7d8d0e25Snbeams   code << "\n    // -- Output field setup --\n";
1110*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
1111*7d8d0e25Snbeams       code << "\n    // ---- Output field "<<i<<" ----\n";
1112*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1113*7d8d0e25Snbeams     CeedChk(ierr);
1114*7d8d0e25Snbeams     if (emode==CEED_EVAL_GRAD)
1115*7d8d0e25Snbeams     {
1116*7d8d0e25Snbeams       if (useCollograd) {
1117*7d8d0e25Snbeams         //Accumulator for gradient slices
1118*7d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1119*7d8d0e25Snbeams         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
1120*7d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
1121*7d8d0e25Snbeams         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
1122*7d8d0e25Snbeams         code << "      }\n";
1123*7d8d0e25Snbeams         code << "    }\n";
1124*7d8d0e25Snbeams       } else {
1125*7d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
1126*7d8d0e25Snbeams       }
1127*7d8d0e25Snbeams     }
1128*7d8d0e25Snbeams     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
1129*7d8d0e25Snbeams     {
1130*7d8d0e25Snbeams       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
1131*7d8d0e25Snbeams     }
1132*7d8d0e25Snbeams   }
1133*7d8d0e25Snbeams   // We treat quadrature points per slice in 3d to save registers
1134*7d8d0e25Snbeams   if (useCollograd) {
1135*7d8d0e25Snbeams     code << "\n    // Note: Collocated Gradient\n";
1136*7d8d0e25Snbeams     code << "#pragma unroll\n";
1137*7d8d0e25Snbeams     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
1138*7d8d0e25Snbeams     code << "      // -- Input fields --\n";
1139*7d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
1140*7d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
1141*7d8d0e25Snbeams       // Get elemsize, emode, ncomp
1142*7d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1143*7d8d0e25Snbeams       CeedChk(ierr);
1144*7d8d0e25Snbeams       // Basis action
1145*7d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1146*7d8d0e25Snbeams       switch (emode) {
1147*7d8d0e25Snbeams       case CEED_EVAL_NONE:
1148*7d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1149*7d8d0e25Snbeams 
1150*7d8d0e25Snbeams         bool isStrided;
1151*7d8d0e25Snbeams         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr);
1152*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr);
1153*7d8d0e25Snbeams         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1154*7d8d0e25Snbeams         if (!isStrided) {
1155*7d8d0e25Snbeams           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1156*7d8d0e25Snbeams           CeedChk(ierr);
1157*7d8d0e25Snbeams           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
1158*7d8d0e25Snbeams           CeedInt compstride;
1159*7d8d0e25Snbeams           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1160*7d8d0e25Snbeams           code << "      // CompStride: "<<compstride<<"\n";
1161*7d8d0e25Snbeams           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1162*7d8d0e25Snbeams           data->indices.in[i] = restr_data->d_ind;
1163*7d8d0e25Snbeams           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
1164*7d8d0e25Snbeams         } else {
1165*7d8d0e25Snbeams           bool backendstrides;
1166*7d8d0e25Snbeams           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1167*7d8d0e25Snbeams           CeedChk(ierr);
1168*7d8d0e25Snbeams           CeedInt nelem;
1169*7d8d0e25Snbeams           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1170*7d8d0e25Snbeams           CeedChk(ierr);
1171*7d8d0e25Snbeams           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1172*7d8d0e25Snbeams           if (!backendstrides) {
1173*7d8d0e25Snbeams             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1174*7d8d0e25Snbeams             CeedChk(ierr);
1175*7d8d0e25Snbeams           }
1176*7d8d0e25Snbeams           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1177*7d8d0e25Snbeams           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
1178*7d8d0e25Snbeams         }
1179*7d8d0e25Snbeams         break;
1180*7d8d0e25Snbeams       case CEED_EVAL_INTERP:
1181*7d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
1182*7d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
1183*7d8d0e25Snbeams         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
1184*7d8d0e25Snbeams         code << "      }\n";
1185*7d8d0e25Snbeams         break;
1186*7d8d0e25Snbeams       case CEED_EVAL_GRAD:
1187*7d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
1188*7d8d0e25Snbeams         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
1189*7d8d0e25Snbeams         break;
1190*7d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
1191*7d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[1];\n";
1192*7d8d0e25Snbeams         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
1193*7d8d0e25Snbeams         break; // No action
1194*7d8d0e25Snbeams       case CEED_EVAL_DIV:
1195*7d8d0e25Snbeams         break; // TODO: Not implemented
1196*7d8d0e25Snbeams       case CEED_EVAL_CURL:
1197*7d8d0e25Snbeams         break; // TODO: Not implemented
1198*7d8d0e25Snbeams       }
1199*7d8d0e25Snbeams     }
1200*7d8d0e25Snbeams     code << "\n      // -- Output fields --\n";
1201*7d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
1202*7d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
1203*7d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1204*7d8d0e25Snbeams       CeedChk(ierr);
1205*7d8d0e25Snbeams       // Basis action
1206*7d8d0e25Snbeams       switch (emode) {
1207*7d8d0e25Snbeams       case CEED_EVAL_NONE:
1208*7d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1209*7d8d0e25Snbeams         break; // No action
1210*7d8d0e25Snbeams       case CEED_EVAL_INTERP:
1211*7d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
1212*7d8d0e25Snbeams         break;
1213*7d8d0e25Snbeams       case CEED_EVAL_GRAD:
1214*7d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
1215*7d8d0e25Snbeams         break;
1216*7d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
1217*7d8d0e25Snbeams         break; // Should not occur
1218*7d8d0e25Snbeams       case CEED_EVAL_DIV:
1219*7d8d0e25Snbeams         break; // TODO: Not implemented
1220*7d8d0e25Snbeams       case CEED_EVAL_CURL:
1221*7d8d0e25Snbeams         break; // TODO: Not implemented
1222*7d8d0e25Snbeams       }
1223*7d8d0e25Snbeams     }
1224*7d8d0e25Snbeams   } else {
1225*7d8d0e25Snbeams     code << "\n      // Note: No Collocated Gradient\n";
1226*7d8d0e25Snbeams     code << "      // -- Input fields --\n";
1227*7d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
1228*7d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
1229*7d8d0e25Snbeams       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
1230*7d8d0e25Snbeams     }
1231*7d8d0e25Snbeams     code << "      // -- Output fields --\n";
1232*7d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
1233*7d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
1234*7d8d0e25Snbeams       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
1235*7d8d0e25Snbeams     }
1236*7d8d0e25Snbeams   }
1237*7d8d0e25Snbeams   code << "\n      // -- QFunction Inputs and outputs --\n";
1238*7d8d0e25Snbeams   code << "      CeedScalar* in["<<numinputfields<<"];\n";
1239*7d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
1240*7d8d0e25Snbeams     code << "      // ---- Input field "<<i<<" ----\n";
1241*7d8d0e25Snbeams     code << "      in["<<i<<"] = r_q"<<i<<";\n";
1242*7d8d0e25Snbeams   }
1243*7d8d0e25Snbeams   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
1244*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
1245*7d8d0e25Snbeams     code << "      // ---- Output field "<<i<<" ----\n";
1246*7d8d0e25Snbeams     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
1247*7d8d0e25Snbeams   }
1248*7d8d0e25Snbeams   code << "\n      // -- Apply QFunction --\n";
1249*7d8d0e25Snbeams   code << "      "<<qFunctionName<<"(ctx, ";
1250*7d8d0e25Snbeams   if (dim != 3 || useCollograd) {
1251*7d8d0e25Snbeams     code << "1";
1252*7d8d0e25Snbeams   } else {
1253*7d8d0e25Snbeams     code << "Q1d";
1254*7d8d0e25Snbeams   }
1255*7d8d0e25Snbeams   code << ", in, out);\n";
1256*7d8d0e25Snbeams   if (useCollograd) {
1257*7d8d0e25Snbeams     code << "\n      // Note: Collocated Gradient\n";
1258*7d8d0e25Snbeams     code << "      // -- Output fields --\n";
1259*7d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
1260*7d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
1261*7d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1262*7d8d0e25Snbeams       CeedChk(ierr);
1263*7d8d0e25Snbeams       // Basis action
1264*7d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1265*7d8d0e25Snbeams       switch (emode) {
1266*7d8d0e25Snbeams       case CEED_EVAL_NONE:
1267*7d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1268*7d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1269*7d8d0e25Snbeams         code << "      }\n";
1270*7d8d0e25Snbeams         break; // No action
1271*7d8d0e25Snbeams       case CEED_EVAL_INTERP:
1272*7d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
1273*7d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
1274*7d8d0e25Snbeams         code << "      }\n";
1275*7d8d0e25Snbeams         break;
1276*7d8d0e25Snbeams       case CEED_EVAL_GRAD:
1277*7d8d0e25Snbeams         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
1278*7d8d0e25Snbeams         break;
1279*7d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
1280*7d8d0e25Snbeams         break; // Should not occur
1281*7d8d0e25Snbeams       case CEED_EVAL_DIV:
1282*7d8d0e25Snbeams         break; // TODO: Not implemented
1283*7d8d0e25Snbeams       case CEED_EVAL_CURL:
1284*7d8d0e25Snbeams         break; // TODO: Not implemented
1285*7d8d0e25Snbeams       }
1286*7d8d0e25Snbeams     }
1287*7d8d0e25Snbeams     code << "    }\n";
1288*7d8d0e25Snbeams   }
1289*7d8d0e25Snbeams 
1290*7d8d0e25Snbeams   // Output basis apply if needed
1291*7d8d0e25Snbeams   // Generate the correct eval mode code for each output
1292*7d8d0e25Snbeams   code << "\n    // -- Output field basis action and restrictions --\n";
1293*7d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
1294*7d8d0e25Snbeams     code << "    // ---- Output field "<<i<<" ----\n";
1295*7d8d0e25Snbeams     // Get elemsize, emode, ncomp
1296*7d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1297*7d8d0e25Snbeams     CeedChk(ierr);
1298*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1299*7d8d0e25Snbeams     CeedChk(ierr);
1300*7d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1301*7d8d0e25Snbeams     CeedChk(ierr);
1302*7d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1303*7d8d0e25Snbeams     CeedChk(ierr);
1304*7d8d0e25Snbeams     // Basis action
1305*7d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
1306*7d8d0e25Snbeams     switch (emode) {
1307*7d8d0e25Snbeams     case CEED_EVAL_NONE:
1308*7d8d0e25Snbeams       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
1309*7d8d0e25Snbeams       break; // No action
1310*7d8d0e25Snbeams     case CEED_EVAL_INTERP:
1311*7d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1312*7d8d0e25Snbeams       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1313*7d8d0e25Snbeams       break;
1314*7d8d0e25Snbeams     case CEED_EVAL_GRAD:
1315*7d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
1316*7d8d0e25Snbeams       if (useCollograd) {
1317*7d8d0e25Snbeams         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
1318*7d8d0e25Snbeams       } else {
1319*7d8d0e25Snbeams         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
1320*7d8d0e25Snbeams       }
1321*7d8d0e25Snbeams       break;
1322*7d8d0e25Snbeams     // LCOV_EXCL_START
1323*7d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
1324*7d8d0e25Snbeams       Ceed ceed;
1325*7d8d0e25Snbeams       ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
1326*7d8d0e25Snbeams       return CeedError(ceed, 1,
1327*7d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
1328*7d8d0e25Snbeams       break; // Should not occur
1329*7d8d0e25Snbeams     }
1330*7d8d0e25Snbeams     case CEED_EVAL_DIV:
1331*7d8d0e25Snbeams       break; // TODO: Not implemented
1332*7d8d0e25Snbeams     case CEED_EVAL_CURL:
1333*7d8d0e25Snbeams       break; // TODO: Not implemented
1334*7d8d0e25Snbeams       // LCOV_EXCL_STOP
1335*7d8d0e25Snbeams     }
1336*7d8d0e25Snbeams     // Restriction
1337*7d8d0e25Snbeams       bool isStrided;
1338*7d8d0e25Snbeams       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr);
1339*7d8d0e25Snbeams     if (!isStrided) {
1340*7d8d0e25Snbeams       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1341*7d8d0e25Snbeams       CeedChk(ierr);
1342*7d8d0e25Snbeams       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
1343*7d8d0e25Snbeams       CeedInt compstride;
1344*7d8d0e25Snbeams       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr);
1345*7d8d0e25Snbeams       code << "    // CompStride: "<<compstride<<"\n";
1346*7d8d0e25Snbeams       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr);
1347*7d8d0e25Snbeams       data->indices.out[i] = restr_data->d_ind;
1348*7d8d0e25Snbeams       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
1349*7d8d0e25Snbeams     } else {
1350*7d8d0e25Snbeams       bool backendstrides;
1351*7d8d0e25Snbeams       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1352*7d8d0e25Snbeams       CeedChk(ierr);
1353*7d8d0e25Snbeams       CeedInt nelem;
1354*7d8d0e25Snbeams       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1355*7d8d0e25Snbeams       CeedChk(ierr);
1356*7d8d0e25Snbeams       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
1357*7d8d0e25Snbeams       if (!backendstrides) {
1358*7d8d0e25Snbeams         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1359*7d8d0e25Snbeams         CeedChk(ierr);
1360*7d8d0e25Snbeams       }
1361*7d8d0e25Snbeams       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
1362*7d8d0e25Snbeams       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
1363*7d8d0e25Snbeams     }
1364*7d8d0e25Snbeams   }
1365*7d8d0e25Snbeams 
1366*7d8d0e25Snbeams   code << "  }\n";
1367*7d8d0e25Snbeams   code << "}\n";
1368*7d8d0e25Snbeams   code << "// -----------------------------------------------------------------------------\n\n";
1369*7d8d0e25Snbeams 
1370*7d8d0e25Snbeams   // View kernel for debugging
1371*7d8d0e25Snbeams   CeedDebug(code.str().c_str());
1372*7d8d0e25Snbeams 
1373*7d8d0e25Snbeams   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
1374*7d8d0e25Snbeams                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1375*7d8d0e25Snbeams   CeedChk(ierr);
1376*7d8d0e25Snbeams   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1377*7d8d0e25Snbeams   CeedChk(ierr);
1378*7d8d0e25Snbeams 
1379*7d8d0e25Snbeams   ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
1380*7d8d0e25Snbeams   return 0;
1381*7d8d0e25Snbeams }
1382*7d8d0e25Snbeams //------------------------------------------------------------------------------
1383