xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 3d8e882215d238700cdceb37404f76ca7fa24eaa)
1*3d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*3d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37d8d0e25Snbeams //
4*3d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57d8d0e25Snbeams //
6*3d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
97d8d0e25Snbeams 
10ec3da8bcSJed Brown #include <ceed/ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
127d8d0e25Snbeams #include <iostream>
133d576824SJeremy L Thompson #include <string>
147d8d0e25Snbeams #include <sstream>
153d576824SJeremy L Thompson #include "ceed-hip-gen.h"
160d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
177d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
187d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
197d8d0e25Snbeams 
20b3e1519bSnbeams 
21b3e1519bSnbeams //------------------------------------------------------------------------------
22b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
23b3e1519bSnbeams //------------------------------------------------------------------------------
2437c3b1cfSnbeams extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
25b3e1519bSnbeams      	                                  const CeedInt P1d, const CeedInt Q1d,
26b3e1519bSnbeams 				          CeedInt *block_sizes) {
27b3e1519bSnbeams 
28b3e1519bSnbeams   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
29b3e1519bSnbeams   if (dim==1) {
30b3e1519bSnbeams     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
31b3e1519bSnbeams     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
32b3e1519bSnbeams     block_sizes[0] = thread1d;
33b3e1519bSnbeams     block_sizes[1] = 1;
34b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
35b3e1519bSnbeams   } else if (dim==2) {
36b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
37b3e1519bSnbeams     block_sizes[0] = thread1d;
38b3e1519bSnbeams     block_sizes[1] = thread1d;
39b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
40b3e1519bSnbeams   } else if (dim==3) {
41b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
42b3e1519bSnbeams     block_sizes[0] = thread1d;
43b3e1519bSnbeams     block_sizes[1] = thread1d;
44b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
45b3e1519bSnbeams   }
46b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
47b3e1519bSnbeams }
48b3e1519bSnbeams 
497d8d0e25Snbeams static const char *deviceFunctions = QUOTE(
507d8d0e25Snbeams 
517d8d0e25Snbeams //------------------------------------------------------------------------------
527d8d0e25Snbeams // Typedefs
537d8d0e25Snbeams //------------------------------------------------------------------------------
547d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
557d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
567d8d0e25Snbeams 
577d8d0e25Snbeams typedef struct {
587d8d0e25Snbeams   CeedInt tidx;
597d8d0e25Snbeams   CeedInt tidy;
607d8d0e25Snbeams   CeedInt tidz;
617d8d0e25Snbeams   CeedInt tid;
627d8d0e25Snbeams   CeedScalar* slice;
637d8d0e25Snbeams } BackendData;
647d8d0e25Snbeams 
657d8d0e25Snbeams //------------------------------------------------------------------------------
667d8d0e25Snbeams // Load matrices for basis actions
677d8d0e25Snbeams //------------------------------------------------------------------------------
687d8d0e25Snbeams template <int P, int Q>
697d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
707d8d0e25Snbeams   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
717d8d0e25Snbeams     B[i] = d_B[i];
727d8d0e25Snbeams }
737d8d0e25Snbeams 
747d8d0e25Snbeams //------------------------------------------------------------------------------
757d8d0e25Snbeams // 1D
767d8d0e25Snbeams //------------------------------------------------------------------------------
777d8d0e25Snbeams 
787d8d0e25Snbeams //------------------------------------------------------------------------------
797d8d0e25Snbeams // L-vector -> E-vector, offsets provided
807d8d0e25Snbeams //------------------------------------------------------------------------------
817d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
827d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
837d8d0e25Snbeams   if (data.tidx < P1d) {
847d8d0e25Snbeams     const CeedInt node = data.tidx;
857d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
867d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
877d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
887d8d0e25Snbeams   }
897d8d0e25Snbeams }
907d8d0e25Snbeams 
917d8d0e25Snbeams //------------------------------------------------------------------------------
927d8d0e25Snbeams // L-vector -> E-vector, strided
937d8d0e25Snbeams //------------------------------------------------------------------------------
947d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
957d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
967d8d0e25Snbeams   if (data.tidx < P1d) {
977d8d0e25Snbeams     const CeedInt node = data.tidx;
987d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
997d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1007d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1017d8d0e25Snbeams   }
1027d8d0e25Snbeams }
1037d8d0e25Snbeams 
1047d8d0e25Snbeams //------------------------------------------------------------------------------
1057d8d0e25Snbeams // E-vector -> L-vector, offsets provided
1067d8d0e25Snbeams //------------------------------------------------------------------------------
1077d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
1087d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
1097d8d0e25Snbeams   if (data.tidx < P1d) {
1107d8d0e25Snbeams     const CeedInt node = data.tidx;
1117d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
1127d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1137d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
1147d8d0e25Snbeams   }
1157d8d0e25Snbeams }
1167d8d0e25Snbeams 
1177d8d0e25Snbeams //------------------------------------------------------------------------------
1187d8d0e25Snbeams // E-vector -> L-vector, strided
1197d8d0e25Snbeams //------------------------------------------------------------------------------
1207d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1217d8d0e25Snbeams inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1227d8d0e25Snbeams   if (data.tidx < P1d) {
1237d8d0e25Snbeams     const CeedInt node = data.tidx;
1247d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1257d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1267d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1277d8d0e25Snbeams   }
1287d8d0e25Snbeams }
1297d8d0e25Snbeams 
1307d8d0e25Snbeams //------------------------------------------------------------------------------
1317d8d0e25Snbeams // 1D tensor contraction x
1327d8d0e25Snbeams //------------------------------------------------------------------------------
1337d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1347d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1357d8d0e25Snbeams   data.slice[data.tidx] = *U;
1367d8d0e25Snbeams   __syncthreads();
1377d8d0e25Snbeams   *V = 0.0;
1387d8d0e25Snbeams   if (data.tidx < Q1d)
1397d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
1407d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
1417d8d0e25Snbeams   __syncthreads();
1427d8d0e25Snbeams }
1437d8d0e25Snbeams 
1447d8d0e25Snbeams //------------------------------------------------------------------------------
1457d8d0e25Snbeams // 1D transpose tensor contraction x
1467d8d0e25Snbeams //------------------------------------------------------------------------------
1477d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1487d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1497d8d0e25Snbeams   data.slice[data.tidx] = *U;
1507d8d0e25Snbeams   __syncthreads();
1517d8d0e25Snbeams   *V = 0.0;
1527d8d0e25Snbeams   if (data.tidx < P1d)
1537d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
1547d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
1557d8d0e25Snbeams   __syncthreads();
1567d8d0e25Snbeams }
1577d8d0e25Snbeams 
1587d8d0e25Snbeams //------------------------------------------------------------------------------
1597d8d0e25Snbeams // 1D interpolate to quadrature points
1607d8d0e25Snbeams //------------------------------------------------------------------------------
1617d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1627d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1637d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1647d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1657d8d0e25Snbeams }
1667d8d0e25Snbeams 
1677d8d0e25Snbeams //------------------------------------------------------------------------------
1687d8d0e25Snbeams // 1D interpolate transpose
1697d8d0e25Snbeams //------------------------------------------------------------------------------
1707d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1717d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1727d8d0e25Snbeams   for (CeedInt comp=0; comp<NCOMP; comp++)
1737d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1747d8d0e25Snbeams }
1757d8d0e25Snbeams 
1767d8d0e25Snbeams //------------------------------------------------------------------------------
1777d8d0e25Snbeams // 1D derivatives at quadrature points
1787d8d0e25Snbeams //------------------------------------------------------------------------------
1797d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1807d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1817d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1827d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1837d8d0e25Snbeams }
1847d8d0e25Snbeams 
1857d8d0e25Snbeams //------------------------------------------------------------------------------
1867d8d0e25Snbeams // 1D derivatives transpose
1877d8d0e25Snbeams //------------------------------------------------------------------------------
1887d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1897d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1907d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1917d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1927d8d0e25Snbeams }
1937d8d0e25Snbeams 
1947d8d0e25Snbeams //------------------------------------------------------------------------------
1957d8d0e25Snbeams // 2D
1967d8d0e25Snbeams //------------------------------------------------------------------------------
1977d8d0e25Snbeams 
1987d8d0e25Snbeams //------------------------------------------------------------------------------
1997d8d0e25Snbeams // L-vector -> E-vector, offsets provided
2007d8d0e25Snbeams //------------------------------------------------------------------------------
2017d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2027d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
2037d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2047d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2057d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2067d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2077d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
2087d8d0e25Snbeams   }
2097d8d0e25Snbeams }
2107d8d0e25Snbeams 
2117d8d0e25Snbeams //------------------------------------------------------------------------------
2127d8d0e25Snbeams // L-vector -> E-vector, strided
2137d8d0e25Snbeams //------------------------------------------------------------------------------
2147d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2157d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2167d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2177d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2187d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2197d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2207d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2217d8d0e25Snbeams   }
2227d8d0e25Snbeams }
2237d8d0e25Snbeams 
2247d8d0e25Snbeams //------------------------------------------------------------------------------
2257d8d0e25Snbeams // E-vector -> L-vector, offsets provided
2267d8d0e25Snbeams //------------------------------------------------------------------------------
2277d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2287d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
2297d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2307d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2317d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2327d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2337d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
2347d8d0e25Snbeams   }
2357d8d0e25Snbeams }
2367d8d0e25Snbeams 
2377d8d0e25Snbeams //------------------------------------------------------------------------------
2387d8d0e25Snbeams // E-vector -> L-vector, strided
2397d8d0e25Snbeams //------------------------------------------------------------------------------
2407d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2417d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2427d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2437d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2447d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2457d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2467d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2477d8d0e25Snbeams   }
2487d8d0e25Snbeams }
2497d8d0e25Snbeams 
2507d8d0e25Snbeams //------------------------------------------------------------------------------
2517d8d0e25Snbeams // 2D tensor contraction x
2527d8d0e25Snbeams //------------------------------------------------------------------------------
2537d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2547d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2557d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2567d8d0e25Snbeams   __syncthreads();
2577d8d0e25Snbeams   *V = 0.0;
2587d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2597d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2607d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2617d8d0e25Snbeams   __syncthreads();
2627d8d0e25Snbeams }
2637d8d0e25Snbeams 
2647d8d0e25Snbeams //------------------------------------------------------------------------------
2657d8d0e25Snbeams // 2D tensor contract y
2667d8d0e25Snbeams //------------------------------------------------------------------------------
2677d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2687d8d0e25Snbeams inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2697d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2707d8d0e25Snbeams   __syncthreads();
2717d8d0e25Snbeams   *V = 0.0;
2727d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d)
2737d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2747d8d0e25Snbeams       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2757d8d0e25Snbeams   __syncthreads();
2767d8d0e25Snbeams }
2777d8d0e25Snbeams 
2787d8d0e25Snbeams //------------------------------------------------------------------------------
2797d8d0e25Snbeams // 2D transpose tensor contract y
2807d8d0e25Snbeams //------------------------------------------------------------------------------
2817d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2827d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2837d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2847d8d0e25Snbeams   __syncthreads();
2857d8d0e25Snbeams   *V = 0.0;
2867d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2877d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2887d8d0e25Snbeams       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2897d8d0e25Snbeams   __syncthreads();
2907d8d0e25Snbeams }
2917d8d0e25Snbeams 
2927d8d0e25Snbeams //------------------------------------------------------------------------------
2937d8d0e25Snbeams // 2D transpose tensor contract x
2947d8d0e25Snbeams //------------------------------------------------------------------------------
2957d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2967d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2977d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2987d8d0e25Snbeams   __syncthreads();
2997d8d0e25Snbeams   *V = 0.0;
3007d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3017d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3027d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3037d8d0e25Snbeams   __syncthreads();
3047d8d0e25Snbeams }
3057d8d0e25Snbeams 
3067d8d0e25Snbeams //------------------------------------------------------------------------------
3077d8d0e25Snbeams // 2D transpose tensor contract and add x
3087d8d0e25Snbeams //------------------------------------------------------------------------------
3097d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3107d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3117d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
3127d8d0e25Snbeams   __syncthreads();
3137d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3147d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3157d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3167d8d0e25Snbeams   __syncthreads();
3177d8d0e25Snbeams }
3187d8d0e25Snbeams 
3197d8d0e25Snbeams //------------------------------------------------------------------------------
3207d8d0e25Snbeams // 2D interpolate to quadrature points
3217d8d0e25Snbeams //------------------------------------------------------------------------------
3227d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3237d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3247d8d0e25Snbeams   CeedScalar r_t[1];
3257d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3267d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3277d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3287d8d0e25Snbeams   }
3297d8d0e25Snbeams }
3307d8d0e25Snbeams 
3317d8d0e25Snbeams //------------------------------------------------------------------------------
3327d8d0e25Snbeams // 2D interpolate transpose
3337d8d0e25Snbeams //------------------------------------------------------------------------------
3347d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3357d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3367d8d0e25Snbeams   CeedScalar r_t[1];
3377d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3387d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3397d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3407d8d0e25Snbeams   }
3417d8d0e25Snbeams }
3427d8d0e25Snbeams 
3437d8d0e25Snbeams //------------------------------------------------------------------------------
3447d8d0e25Snbeams // 2D derivatives at quadrature points
3457d8d0e25Snbeams //------------------------------------------------------------------------------
3467d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3477d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3487d8d0e25Snbeams   CeedScalar r_t[1];
3497d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3507d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
3517d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
3527d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3537d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
3547d8d0e25Snbeams   }
3557d8d0e25Snbeams }
3567d8d0e25Snbeams 
3577d8d0e25Snbeams //------------------------------------------------------------------------------
3587d8d0e25Snbeams // 2D derivatives transpose
3597d8d0e25Snbeams //------------------------------------------------------------------------------
3607d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3617d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3627d8d0e25Snbeams   CeedScalar r_t[1];
3637d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3647d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
3657d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
3667d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
3677d8d0e25Snbeams     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3687d8d0e25Snbeams   }
3697d8d0e25Snbeams }
3707d8d0e25Snbeams 
3717d8d0e25Snbeams //------------------------------------------------------------------------------
3727d8d0e25Snbeams // 3D
3737d8d0e25Snbeams //------------------------------------------------------------------------------
3747d8d0e25Snbeams 
3757d8d0e25Snbeams //------------------------------------------------------------------------------
3767d8d0e25Snbeams // L-vector -> E-vector, offsets provided
3777d8d0e25Snbeams //------------------------------------------------------------------------------
3787d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
3797d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
3807d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3817d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3827d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3837d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
3847d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3857d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
3867d8d0e25Snbeams     }
3877d8d0e25Snbeams }
3887d8d0e25Snbeams 
3897d8d0e25Snbeams //------------------------------------------------------------------------------
3907d8d0e25Snbeams // L-vector -> E-vector, strided
3917d8d0e25Snbeams //------------------------------------------------------------------------------
3927d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
3937d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
3947d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3957d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3967d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3977d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
3987d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3997d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
4007d8d0e25Snbeams     }
4017d8d0e25Snbeams }
4027d8d0e25Snbeams 
4037d8d0e25Snbeams //------------------------------------------------------------------------------
4047d8d0e25Snbeams // E-vector -> Q-vector, offests provided
4057d8d0e25Snbeams //------------------------------------------------------------------------------
4067d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d>
4077d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
4087d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4097d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4107d8d0e25Snbeams     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
4117d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4127d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
4137d8d0e25Snbeams   }
4147d8d0e25Snbeams }
4157d8d0e25Snbeams 
4167d8d0e25Snbeams //------------------------------------------------------------------------------
4177d8d0e25Snbeams // E-vector -> Q-vector, strided
4187d8d0e25Snbeams //------------------------------------------------------------------------------
4197d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4207d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
4217d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4227d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4237d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4247d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4257d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
4267d8d0e25Snbeams   }
4277d8d0e25Snbeams }
4287d8d0e25Snbeams 
4297d8d0e25Snbeams //------------------------------------------------------------------------------
4307d8d0e25Snbeams // E-vector -> L-vector, offsets provided
4317d8d0e25Snbeams //------------------------------------------------------------------------------
4327d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
4337d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
4347d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4357d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4367d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4377d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
4387d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4397d8d0e25Snbeams         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
4407d8d0e25Snbeams     }
4417d8d0e25Snbeams }
4427d8d0e25Snbeams 
4437d8d0e25Snbeams //------------------------------------------------------------------------------
4447d8d0e25Snbeams // E-vector -> L-vector, strided
4457d8d0e25Snbeams //------------------------------------------------------------------------------
4467d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4477d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
4487d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4497d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4507d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4517d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4527d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4537d8d0e25Snbeams         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
4547d8d0e25Snbeams     }
4557d8d0e25Snbeams }
4567d8d0e25Snbeams 
4577d8d0e25Snbeams //------------------------------------------------------------------------------
4587d8d0e25Snbeams // 3D tensor contract x
4597d8d0e25Snbeams //------------------------------------------------------------------------------
4607d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4617d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4627d8d0e25Snbeams   CeedScalar r_B[P1d];
4637d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4647d8d0e25Snbeams     r_B[i] = B[i + data.tidx*P1d];
4657d8d0e25Snbeams 
4667d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4677d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4687d8d0e25Snbeams     __syncthreads();
4697d8d0e25Snbeams     V[k] = 0.0;
4707d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
4717d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4727d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
4737d8d0e25Snbeams     __syncthreads();
4747d8d0e25Snbeams   }
4757d8d0e25Snbeams }
4767d8d0e25Snbeams 
4777d8d0e25Snbeams //------------------------------------------------------------------------------
4787d8d0e25Snbeams // 3D tensor contract y
4797d8d0e25Snbeams //------------------------------------------------------------------------------
4807d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4817d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4827d8d0e25Snbeams   CeedScalar r_B[P1d];
4837d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4847d8d0e25Snbeams     r_B[i] = B[i + data.tidy*P1d];
4857d8d0e25Snbeams 
4867d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4877d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4887d8d0e25Snbeams     __syncthreads();
4897d8d0e25Snbeams     V[k] = 0.0;
4907d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
4917d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4927d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
4937d8d0e25Snbeams     __syncthreads();
4947d8d0e25Snbeams   }
4957d8d0e25Snbeams }
4967d8d0e25Snbeams 
4977d8d0e25Snbeams //------------------------------------------------------------------------------
4987d8d0e25Snbeams // 3D tensor contract z
4997d8d0e25Snbeams //------------------------------------------------------------------------------
5007d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5017d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5027d8d0e25Snbeams   for (CeedInt k = 0; k < Q1d; ++k) {
5037d8d0e25Snbeams     V[k] = 0.0;
5047d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5057d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
5067d8d0e25Snbeams         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
5077d8d0e25Snbeams   }
5087d8d0e25Snbeams }
5097d8d0e25Snbeams 
5107d8d0e25Snbeams //------------------------------------------------------------------------------
5117d8d0e25Snbeams // 3D transpose tensor contract z
5127d8d0e25Snbeams //------------------------------------------------------------------------------
5137d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5147d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5157d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5167d8d0e25Snbeams     V[k] = 0.0;
5177d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5187d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5197d8d0e25Snbeams         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
5207d8d0e25Snbeams   }
5217d8d0e25Snbeams }
5227d8d0e25Snbeams 
5237d8d0e25Snbeams //------------------------------------------------------------------------------
5247d8d0e25Snbeams // 3D transpose tensor contract y
5257d8d0e25Snbeams //------------------------------------------------------------------------------
5267d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5277d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5287d8d0e25Snbeams   CeedScalar r_B[Q1d];
5297d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5307d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5317d8d0e25Snbeams 
5327d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5337d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5347d8d0e25Snbeams     __syncthreads();
5357d8d0e25Snbeams     V[k] = 0.0;
5367d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5377d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5387d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5397d8d0e25Snbeams     __syncthreads();
5407d8d0e25Snbeams   }
5417d8d0e25Snbeams }
5427d8d0e25Snbeams 
5437d8d0e25Snbeams //------------------------------------------------------------------------------
5447d8d0e25Snbeams // 3D transpose tensor contract add y
5457d8d0e25Snbeams //------------------------------------------------------------------------------
5467d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5477d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5487d8d0e25Snbeams   CeedScalar r_B[Q1d];
5497d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5507d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5517d8d0e25Snbeams 
5527d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5537d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5547d8d0e25Snbeams     __syncthreads();
5557d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5567d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5577d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5587d8d0e25Snbeams     __syncthreads();
5597d8d0e25Snbeams   }
5607d8d0e25Snbeams }
5617d8d0e25Snbeams 
5627d8d0e25Snbeams //------------------------------------------------------------------------------
5637d8d0e25Snbeams // 3D transpose tensor contract x
5647d8d0e25Snbeams //------------------------------------------------------------------------------
5657d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5667d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5677d8d0e25Snbeams   CeedScalar r_B[Q1d];
5687d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5697d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5707d8d0e25Snbeams 
5717d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5727d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5737d8d0e25Snbeams     __syncthreads();
5747d8d0e25Snbeams     V[k] = 0.0;
5757d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5767d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5777d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5787d8d0e25Snbeams     __syncthreads();
5797d8d0e25Snbeams   }
5807d8d0e25Snbeams }
5817d8d0e25Snbeams 
5827d8d0e25Snbeams //------------------------------------------------------------------------------
5837d8d0e25Snbeams // 3D transpose tensor contract add x
5847d8d0e25Snbeams //------------------------------------------------------------------------------
5857d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5867d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5877d8d0e25Snbeams   CeedScalar r_B[Q1d];
5887d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5897d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5907d8d0e25Snbeams 
5917d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5927d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5937d8d0e25Snbeams     __syncthreads();
5947d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5957d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5967d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5977d8d0e25Snbeams     __syncthreads();
5987d8d0e25Snbeams   }
5997d8d0e25Snbeams }
6007d8d0e25Snbeams 
6017d8d0e25Snbeams //------------------------------------------------------------------------------
6027d8d0e25Snbeams // 3D interpolate to quadrature points
6037d8d0e25Snbeams //------------------------------------------------------------------------------
6047d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6057d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6067d8d0e25Snbeams   CeedScalar r_t1[T1d];
6077d8d0e25Snbeams   CeedScalar r_t2[T1d];
6087d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6097d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6107d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6117d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
6127d8d0e25Snbeams   }
6137d8d0e25Snbeams }
6147d8d0e25Snbeams 
6157d8d0e25Snbeams //------------------------------------------------------------------------------
6167d8d0e25Snbeams // 3D interpolate transpose
6177d8d0e25Snbeams //------------------------------------------------------------------------------
6187d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6197d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6207d8d0e25Snbeams   CeedScalar r_t1[T1d];
6217d8d0e25Snbeams   CeedScalar r_t2[T1d];
6227d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6237d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
6247d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6257d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6267d8d0e25Snbeams   }
6277d8d0e25Snbeams }
6287d8d0e25Snbeams 
6297d8d0e25Snbeams //------------------------------------------------------------------------------
6307d8d0e25Snbeams // 3D derivatives at quadrature points
6317d8d0e25Snbeams //------------------------------------------------------------------------------
6327d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6337d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6347d8d0e25Snbeams   CeedScalar r_t1[T1d];
6357d8d0e25Snbeams   CeedScalar r_t2[T1d];
6367d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6377d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
6387d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6397d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
6407d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6417d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6427d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
6437d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6447d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6457d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
6467d8d0e25Snbeams   }
6477d8d0e25Snbeams }
6487d8d0e25Snbeams 
6497d8d0e25Snbeams //------------------------------------------------------------------------------
6507d8d0e25Snbeams // 3D derivatives transpose
6517d8d0e25Snbeams //------------------------------------------------------------------------------
6527d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6537d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6547d8d0e25Snbeams   CeedScalar r_t1[T1d];
6557d8d0e25Snbeams   CeedScalar r_t2[T1d];
6567d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6577d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
6587d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6597d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
6607d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
6617d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6627d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6637d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
6647d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6657d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6667d8d0e25Snbeams   }
6677d8d0e25Snbeams }
6687d8d0e25Snbeams 
6697d8d0e25Snbeams //------------------------------------------------------------------------------
6707d8d0e25Snbeams // 3D collocated derivatives computation
6717d8d0e25Snbeams //------------------------------------------------------------------------------
6727d8d0e25Snbeams template <int NCOMP, int Q1d>
6737d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6747d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
6757d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
6767d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
6777d8d0e25Snbeams       __syncthreads();
6787d8d0e25Snbeams       // X derivative
6797d8d0e25Snbeams       r_V[comp+0*NCOMP] = 0.0;
6807d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6817d8d0e25Snbeams         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
6827d8d0e25Snbeams       // Y derivative
6837d8d0e25Snbeams       r_V[comp+1*NCOMP] = 0.0;
6847d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6857d8d0e25Snbeams         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
6867d8d0e25Snbeams       // Z derivative
6877d8d0e25Snbeams       r_V[comp+2*NCOMP] = 0.0;
6887d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6897d8d0e25Snbeams         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
6907d8d0e25Snbeams       __syncthreads();
6917d8d0e25Snbeams     }
6927d8d0e25Snbeams   }
6937d8d0e25Snbeams }
6947d8d0e25Snbeams 
6957d8d0e25Snbeams //------------------------------------------------------------------------------
6967d8d0e25Snbeams // 3D collocated derivatives transpose
6977d8d0e25Snbeams //------------------------------------------------------------------------------
6987d8d0e25Snbeams template <int NCOMP, int Q1d>
6997d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
7007d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
7017d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
7027d8d0e25Snbeams       // X derivative
7037d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
7047d8d0e25Snbeams       __syncthreads();
7057d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7067d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
7077d8d0e25Snbeams       __syncthreads();
7087d8d0e25Snbeams       // Y derivative
7097d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
7107d8d0e25Snbeams       __syncthreads();
7117d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7127d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
7137d8d0e25Snbeams       __syncthreads();
7147d8d0e25Snbeams       // Z derivative
7157d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7167d8d0e25Snbeams         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
7177d8d0e25Snbeams     }
7187d8d0e25Snbeams   }
7197d8d0e25Snbeams }
7207d8d0e25Snbeams 
7217d8d0e25Snbeams //------------------------------------------------------------------------------
7227d8d0e25Snbeams // 1D quadrature weights
7237d8d0e25Snbeams //------------------------------------------------------------------------------
7247d8d0e25Snbeams template <int Q1d>
7257d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7267d8d0e25Snbeams   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
7277d8d0e25Snbeams }
7287d8d0e25Snbeams 
7297d8d0e25Snbeams //------------------------------------------------------------------------------
7307d8d0e25Snbeams // 2D quadrature weights
7317d8d0e25Snbeams //------------------------------------------------------------------------------
7327d8d0e25Snbeams template <int Q1d>
7337d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7347d8d0e25Snbeams   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
7357d8d0e25Snbeams         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7367d8d0e25Snbeams }
7377d8d0e25Snbeams 
7387d8d0e25Snbeams //------------------------------------------------------------------------------
7397d8d0e25Snbeams // 3D quadrature weights
7407d8d0e25Snbeams //------------------------------------------------------------------------------
7417d8d0e25Snbeams template <int Q1d>
7427d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7437d8d0e25Snbeams   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
7447d8d0e25Snbeams   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7457d8d0e25Snbeams   for (CeedInt z = 0; z < Q1d; ++z)
7467d8d0e25Snbeams     w[z] = quad ? pw*qweight1d[z] : 0.0;
7477d8d0e25Snbeams }
7487d8d0e25Snbeams 
7497d8d0e25Snbeams );
7507d8d0e25Snbeams //------------------------------------------------------------------------------
7517d8d0e25Snbeams // Build singe operator kernel
7527d8d0e25Snbeams //------------------------------------------------------------------------------
75337c3b1cfSnbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
7547d8d0e25Snbeams 
7557d8d0e25Snbeams   using std::ostringstream;
7567d8d0e25Snbeams   using std::string;
7577d8d0e25Snbeams   int ierr;
7587d8d0e25Snbeams   bool setupdone;
759e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
760e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
7617d8d0e25Snbeams   Ceed ceed;
762e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
7637d8d0e25Snbeams   CeedOperator_Hip_gen *data;
764e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
7657d8d0e25Snbeams   CeedQFunction qf;
7667d8d0e25Snbeams   CeedQFunction_Hip_gen *qf_data;
767e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
768e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
769e79b91d9SJeremy L Thompson   CeedSize lsize;
77078464608Sjeremylt   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
771e79b91d9SJeremy L Thompson           numoutputfields, ncomp, dim = 0;
772e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
773e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
7747d8d0e25Snbeams   CeedOperatorField *opinputfields, *opoutputfields;
7757e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
776e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7777d8d0e25Snbeams   CeedQFunctionField *qfinputfields, *qfoutputfields;
7787e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
779e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7807d8d0e25Snbeams   CeedEvalMode emode;
7817d8d0e25Snbeams   CeedBasis basis;
7827d8d0e25Snbeams   CeedBasis_Hip_shared *basis_data;
7837d8d0e25Snbeams   CeedElemRestriction Erestrict;
7847d8d0e25Snbeams   CeedElemRestriction_Hip *restr_data;
7857d8d0e25Snbeams 
7860b454692Sjeremylt   // Check for restriction only identity operator
7870b454692Sjeremylt   bool is_identity_qf;
7880b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7890b454692Sjeremylt   if (is_identity_qf) {
7900b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7910b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
7920b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
7930b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
7940b454692Sjeremylt       // LCOV_EXCL_START
7950b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
7960b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
7970b454692Sjeremylt     // LCOV_EXCL_STOP
7980b454692Sjeremylt   }
7990b454692Sjeremylt 
8007d8d0e25Snbeams   ostringstream code;
8017d8d0e25Snbeams   string devFunctions(deviceFunctions);
8027d8d0e25Snbeams   code << devFunctions;
8037d8d0e25Snbeams 
8047d8d0e25Snbeams   string qFunction(qf_data->qFunctionSource);
8057d8d0e25Snbeams   string qFunctionName(qf_data->qFunctionName);
8067d8d0e25Snbeams   string oper;
8077d8d0e25Snbeams   oper = "CeedKernel_Hip_gen_" + qFunctionName;
8087d8d0e25Snbeams 
8097d8d0e25Snbeams   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81077841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
811e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
812e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8137d8d0e25Snbeams 
8147d8d0e25Snbeams   // Find dim and Q1d
8157d8d0e25Snbeams   bool useCollograd = true;
8167d8d0e25Snbeams   data->maxP1d = 0;
8177d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
818e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8197d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
820e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8217d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
822e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8237d8d0e25Snbeams 
8247d8d0e25Snbeams       // Check for collocated gradient
825437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8267d8d0e25Snbeams 
8277d8d0e25Snbeams       // Collect dim and Q1d
828e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8297d8d0e25Snbeams       bool isTensor;
830e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8317d8d0e25Snbeams       if (isTensor) {
832e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
833e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
8347d8d0e25Snbeams         if (P1d>data->maxP1d) data->maxP1d = P1d;
8357d8d0e25Snbeams       } else {
8367d8d0e25Snbeams         // LCOV_EXCL_START
837e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8387d8d0e25Snbeams         // LCOV_EXCL_STOP
8397d8d0e25Snbeams         }
8407d8d0e25Snbeams     }
8417d8d0e25Snbeams   }
8427d8d0e25Snbeams   // Check output bases for Q1d, dim as well
8437d8d0e25Snbeams   //   The only imput basis might be CEED_BASIS_COLLOCATED
8447d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
845e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8467d8d0e25Snbeams 
8477d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
848e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8497d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
850e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8517d8d0e25Snbeams 
8527d8d0e25Snbeams       // Collect dim and Q1d
853e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8547d8d0e25Snbeams       bool isTensor;
855e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8567d8d0e25Snbeams       if (isTensor) {
857e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8587d8d0e25Snbeams       } else {
8597d8d0e25Snbeams         // LCOV_EXCL_START
860e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8617d8d0e25Snbeams         // LCOV_EXCL_STOP
8627d8d0e25Snbeams         }
8637d8d0e25Snbeams 
8647d8d0e25Snbeams       // Check for collocated gradient
865437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8667d8d0e25Snbeams     }
8677d8d0e25Snbeams   }
8687d8d0e25Snbeams   data->dim = dim;
8697d8d0e25Snbeams   data->Q1d = Q1d;
8707d8d0e25Snbeams 
8717d8d0e25Snbeams   // Define CEED_Q_VLA
8727d8d0e25Snbeams   if (dim != 3 || useCollograd) {
8737d8d0e25Snbeams     code << "\n#define CEED_Q_VLA 1\n\n";
8747d8d0e25Snbeams   } else {
8757d8d0e25Snbeams     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8767d8d0e25Snbeams   }
8777d8d0e25Snbeams 
8787d8d0e25Snbeams   code << qFunction;
8797d8d0e25Snbeams 
8807d8d0e25Snbeams   // Setup
8817d8d0e25Snbeams   code << "\n// -----------------------------------------------------------------------------\n";
882b3e1519bSnbeams   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
883b3e1519bSnbeams   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
8847d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
8857d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
886e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8877d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
8887d8d0e25Snbeams       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
8897d8d0e25Snbeams     }
8907d8d0e25Snbeams   }
8917d8d0e25Snbeams 
8927d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
8937d8d0e25Snbeams     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
8947d8d0e25Snbeams   }
8957d8d0e25Snbeams 
8967d8d0e25Snbeams   code << "  const CeedInt Dim = "<<dim<<";\n";
8977d8d0e25Snbeams   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8987d8d0e25Snbeams 
8997d8d0e25Snbeams   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
9007d8d0e25Snbeams   code << "  BackendData data;\n";
9017d8d0e25Snbeams   code << "  data.tidx = threadIdx.x;\n";
9027d8d0e25Snbeams   code << "  data.tidy = threadIdx.y;\n";
9037d8d0e25Snbeams   code << "  data.tidz = threadIdx.z;\n";
9047d8d0e25Snbeams   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
9057d8d0e25Snbeams   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
9067d8d0e25Snbeams 
9077d8d0e25Snbeams   code << "\n  // -- Input field constants and basis data --\n";
9087d8d0e25Snbeams   //Initialize constants, and matrices B and G
9097d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
9107d8d0e25Snbeams     code << "  // ---- Input field "<<i<<" ----\n";
9117d8d0e25Snbeams     // Get elemsize, emode, ncomp
9127d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
913e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9147d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
915e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9167d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
917e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9187d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
919e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9207d8d0e25Snbeams 
9217d8d0e25Snbeams     // Set field constants
9227d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) {
923e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
9247d8d0e25Snbeams       if (basis != CEED_BASIS_COLLOCATED) {
925e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9267d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
9277d8d0e25Snbeams       } else {
9287d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
9297d8d0e25Snbeams       }
9307d8d0e25Snbeams       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9317d8d0e25Snbeams     }
9327d8d0e25Snbeams 
9337d8d0e25Snbeams     // Load basis data
9347d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9357d8d0e25Snbeams     switch (emode) {
9367d8d0e25Snbeams     case CEED_EVAL_NONE:
9377d8d0e25Snbeams       break;
9387d8d0e25Snbeams     case CEED_EVAL_INTERP:
939e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
940437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94180a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9427d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9437d8d0e25Snbeams       break;
9447d8d0e25Snbeams     case CEED_EVAL_GRAD:
945e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
946437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94780a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9487d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9497d8d0e25Snbeams       if (useCollograd) {
950437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
95180a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
9527d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9537d8d0e25Snbeams       } else {
954437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
95580a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9567d8d0e25Snbeams         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9577d8d0e25Snbeams       }
9587d8d0e25Snbeams       break;
9597d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
9607d8d0e25Snbeams       break; // No action
9617d8d0e25Snbeams     case CEED_EVAL_DIV:
9627d8d0e25Snbeams       break; // TODO: Not implemented
9637d8d0e25Snbeams     case CEED_EVAL_CURL:
9647d8d0e25Snbeams       break; // TODO: Not implemented
9657d8d0e25Snbeams     }
9667d8d0e25Snbeams   }
9677d8d0e25Snbeams 
9687d8d0e25Snbeams   code << "\n  // -- Output field constants and basis data --\n";
9697d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9707d8d0e25Snbeams     code << "  // ---- Output field "<<i<<" ----\n";
9717d8d0e25Snbeams     // Get elemsize, emode, ncomp
9727d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
973e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9747d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
975e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9767d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
977e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9787d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
979e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9807d8d0e25Snbeams 
9817d8d0e25Snbeams     // Set field constants
982e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
9837d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
984e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9857d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
9867d8d0e25Snbeams     } else {
9877d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
9887d8d0e25Snbeams     }
9897d8d0e25Snbeams     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
9907d8d0e25Snbeams 
9917d8d0e25Snbeams     // Load basis data
9927d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9937d8d0e25Snbeams     switch (emode) {
9947d8d0e25Snbeams     case CEED_EVAL_NONE:
9957d8d0e25Snbeams       break; // No action
9967d8d0e25Snbeams     case CEED_EVAL_INTERP:
997e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
998437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
99980a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10007d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10017d8d0e25Snbeams       break;
10027d8d0e25Snbeams     case CEED_EVAL_GRAD:
1003e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1004437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100580a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10067d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10077d8d0e25Snbeams       if (useCollograd) {
1008437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
100980a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
10107d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10117d8d0e25Snbeams       } else {
1012437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
101380a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10147d8d0e25Snbeams         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10157d8d0e25Snbeams       }
10167d8d0e25Snbeams       break;
10177d8d0e25Snbeams     // LCOV_EXCL_START
10187d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
10197d8d0e25Snbeams       Ceed ceed;
1020e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1021e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
10227d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
10237d8d0e25Snbeams       break; // Should not occur
10247d8d0e25Snbeams     }
10257d8d0e25Snbeams     case CEED_EVAL_DIV:
10267d8d0e25Snbeams       break; // TODO: Not implemented
10277d8d0e25Snbeams     case CEED_EVAL_CURL:
10287d8d0e25Snbeams       break; // TODO: Not implemented
10297d8d0e25Snbeams       // LCOV_EXCL_STOP
10307d8d0e25Snbeams     }
10317d8d0e25Snbeams   }
10327d8d0e25Snbeams   code << "\n  // -- Element loop --\n";
10337d8d0e25Snbeams   code << "  __syncthreads();\n";
10347d8d0e25Snbeams   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
10357d8d0e25Snbeams   // Input basis apply if needed
10367d8d0e25Snbeams   // Generate the correct eval mode code for each input
10377d8d0e25Snbeams   code << "    // -- Input field restrictions and basis actions --\n";
10387d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
10397d8d0e25Snbeams     code << "    // ---- Input field "<<i<<" ----\n";
10407d8d0e25Snbeams     // Get elemsize, emode, ncomp
10417d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1042e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10437d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1044e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10457d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1046e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10477d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1048e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10497d8d0e25Snbeams 
10507d8d0e25Snbeams     // Restriction
10517d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT &&
10527d8d0e25Snbeams         !((emode == CEED_EVAL_NONE) && useCollograd)) {
10537d8d0e25Snbeams       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10547d8d0e25Snbeams 
10557d8d0e25Snbeams       bool isStrided;
1056e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10577d8d0e25Snbeams       if (!isStrided) {
10587d8d0e25Snbeams         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1059e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10607d8d0e25Snbeams         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10617d8d0e25Snbeams         CeedInt compstride;
1062e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10637d8d0e25Snbeams         code << "    // CompStride: "<<compstride<<"\n";
1064e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10657d8d0e25Snbeams         data->indices.in[i] = restr_data->d_ind;
10667d8d0e25Snbeams         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
10677d8d0e25Snbeams       } else {
10687d8d0e25Snbeams         bool backendstrides;
10697d8d0e25Snbeams         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1070e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10717d8d0e25Snbeams         CeedInt nelem;
10727d8d0e25Snbeams         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1073e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10747d8d0e25Snbeams         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
10757d8d0e25Snbeams         if (!backendstrides) {
10767d8d0e25Snbeams           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1077e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
10787d8d0e25Snbeams         }
10797d8d0e25Snbeams         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
10807d8d0e25Snbeams         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
10817d8d0e25Snbeams       }
10827d8d0e25Snbeams     }
10837d8d0e25Snbeams 
10847d8d0e25Snbeams     // Basis action
10857d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10867d8d0e25Snbeams     switch (emode) {
10877d8d0e25Snbeams     case CEED_EVAL_NONE:
10887d8d0e25Snbeams       if (!useCollograd) {
10897d8d0e25Snbeams         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
10907d8d0e25Snbeams       }
10917d8d0e25Snbeams       break;
10927d8d0e25Snbeams     case CEED_EVAL_INTERP:
10937d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
10947d8d0e25Snbeams       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
10957d8d0e25Snbeams       break;
10967d8d0e25Snbeams     case CEED_EVAL_GRAD:
10977d8d0e25Snbeams       if (useCollograd) {
10987d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
10997d8d0e25Snbeams         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
11007d8d0e25Snbeams       } else {
11017d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
11027d8d0e25Snbeams         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
11037d8d0e25Snbeams       }
11047d8d0e25Snbeams       break;
11057d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
11067d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1107e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1108e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1109437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
11107d8d0e25Snbeams       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
11117d8d0e25Snbeams       break; // No action
11127d8d0e25Snbeams     case CEED_EVAL_DIV:
11137d8d0e25Snbeams       break; // TODO: Not implemented
11147d8d0e25Snbeams     case CEED_EVAL_CURL:
11157d8d0e25Snbeams       break; // TODO: Not implemented
11167d8d0e25Snbeams     }
11177d8d0e25Snbeams   }
11187d8d0e25Snbeams 
11197d8d0e25Snbeams   // Q function
11207d8d0e25Snbeams   code << "\n    // -- Output field setup --\n";
11217d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
11227d8d0e25Snbeams       code << "\n    // ---- Output field "<<i<<" ----\n";
11237d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1124e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
11257d8d0e25Snbeams     if (emode==CEED_EVAL_GRAD)
11267d8d0e25Snbeams     {
11277d8d0e25Snbeams       if (useCollograd) {
11287d8d0e25Snbeams         //Accumulator for gradient slices
11297d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11307d8d0e25Snbeams         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
11317d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
11327d8d0e25Snbeams         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
11337d8d0e25Snbeams         code << "      }\n";
11347d8d0e25Snbeams         code << "    }\n";
11357d8d0e25Snbeams       } else {
11367d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
11377d8d0e25Snbeams       }
11387d8d0e25Snbeams     }
11397d8d0e25Snbeams     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
11407d8d0e25Snbeams     {
11417d8d0e25Snbeams       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11427d8d0e25Snbeams     }
11437d8d0e25Snbeams   }
11447d8d0e25Snbeams   // We treat quadrature points per slice in 3d to save registers
11457d8d0e25Snbeams   if (useCollograd) {
11467d8d0e25Snbeams     code << "\n    // Note: Collocated Gradient\n";
11477d8d0e25Snbeams     code << "#pragma unroll\n";
11487d8d0e25Snbeams     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
11497d8d0e25Snbeams     code << "      // -- Input fields --\n";
11507d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
11517d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
11527d8d0e25Snbeams       // Get elemsize, emode, ncomp
11537d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1154e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
11557d8d0e25Snbeams       // Basis action
11567d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
11577d8d0e25Snbeams       switch (emode) {
11587d8d0e25Snbeams       case CEED_EVAL_NONE:
11597d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11607d8d0e25Snbeams 
11617d8d0e25Snbeams         bool isStrided;
1162e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1163e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1164e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11657d8d0e25Snbeams         if (!isStrided) {
11667d8d0e25Snbeams           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1167e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11687d8d0e25Snbeams           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11697d8d0e25Snbeams           CeedInt compstride;
1170e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11717d8d0e25Snbeams           code << "      // CompStride: "<<compstride<<"\n";
1172e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11737d8d0e25Snbeams           data->indices.in[i] = restr_data->d_ind;
11747d8d0e25Snbeams           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
11757d8d0e25Snbeams         } else {
11767d8d0e25Snbeams           bool backendstrides;
11777d8d0e25Snbeams           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1178e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11797d8d0e25Snbeams           CeedInt nelem;
11807d8d0e25Snbeams           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1181e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11827d8d0e25Snbeams           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
11837d8d0e25Snbeams           if (!backendstrides) {
11847d8d0e25Snbeams             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1185e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
11867d8d0e25Snbeams           }
11877d8d0e25Snbeams           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
11887d8d0e25Snbeams           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
11897d8d0e25Snbeams         }
11907d8d0e25Snbeams         break;
11917d8d0e25Snbeams       case CEED_EVAL_INTERP:
11927d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11937d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
11947d8d0e25Snbeams         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
11957d8d0e25Snbeams         code << "      }\n";
11967d8d0e25Snbeams         break;
11977d8d0e25Snbeams       case CEED_EVAL_GRAD:
11987d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
11997d8d0e25Snbeams         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
12007d8d0e25Snbeams         break;
12017d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12027d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[1];\n";
12037d8d0e25Snbeams         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
12047d8d0e25Snbeams         break; // No action
12057d8d0e25Snbeams       case CEED_EVAL_DIV:
12067d8d0e25Snbeams         break; // TODO: Not implemented
12077d8d0e25Snbeams       case CEED_EVAL_CURL:
12087d8d0e25Snbeams         break; // TODO: Not implemented
12097d8d0e25Snbeams       }
12107d8d0e25Snbeams     }
12117d8d0e25Snbeams     code << "\n      // -- Output fields --\n";
12127d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12137d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12147d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1215e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12167d8d0e25Snbeams       // Basis action
12177d8d0e25Snbeams       switch (emode) {
12187d8d0e25Snbeams       case CEED_EVAL_NONE:
12197d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12207d8d0e25Snbeams         break; // No action
12217d8d0e25Snbeams       case CEED_EVAL_INTERP:
12227d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12237d8d0e25Snbeams         break;
12247d8d0e25Snbeams       case CEED_EVAL_GRAD:
12257d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
12267d8d0e25Snbeams         break;
12277d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12287d8d0e25Snbeams         break; // Should not occur
12297d8d0e25Snbeams       case CEED_EVAL_DIV:
12307d8d0e25Snbeams         break; // TODO: Not implemented
12317d8d0e25Snbeams       case CEED_EVAL_CURL:
12327d8d0e25Snbeams         break; // TODO: Not implemented
12337d8d0e25Snbeams       }
12347d8d0e25Snbeams     }
12357d8d0e25Snbeams   } else {
12367d8d0e25Snbeams     code << "\n      // Note: No Collocated Gradient\n";
12377d8d0e25Snbeams     code << "      // -- Input fields --\n";
12387d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
12397d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
12407d8d0e25Snbeams       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
12417d8d0e25Snbeams     }
12427d8d0e25Snbeams     code << "      // -- Output fields --\n";
12437d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12447d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12457d8d0e25Snbeams       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
12467d8d0e25Snbeams     }
12477d8d0e25Snbeams   }
12487d8d0e25Snbeams   code << "\n      // -- QFunction Inputs and outputs --\n";
12497d8d0e25Snbeams   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12507d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
12517d8d0e25Snbeams     code << "      // ---- Input field "<<i<<" ----\n";
12527d8d0e25Snbeams     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12537d8d0e25Snbeams   }
12547d8d0e25Snbeams   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12557d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
12567d8d0e25Snbeams     code << "      // ---- Output field "<<i<<" ----\n";
12577d8d0e25Snbeams     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12587d8d0e25Snbeams   }
12597d8d0e25Snbeams   code << "\n      // -- Apply QFunction --\n";
12607d8d0e25Snbeams   code << "      "<<qFunctionName<<"(ctx, ";
12617d8d0e25Snbeams   if (dim != 3 || useCollograd) {
12627d8d0e25Snbeams     code << "1";
12637d8d0e25Snbeams   } else {
12647d8d0e25Snbeams     code << "Q1d";
12657d8d0e25Snbeams   }
12667d8d0e25Snbeams   code << ", in, out);\n";
12677d8d0e25Snbeams   if (useCollograd) {
12687d8d0e25Snbeams     code << "\n      // Note: Collocated Gradient\n";
12697d8d0e25Snbeams     code << "      // -- Output fields --\n";
12707d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12717d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12727d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1273e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12747d8d0e25Snbeams       // Basis action
12757d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
12767d8d0e25Snbeams       switch (emode) {
12777d8d0e25Snbeams       case CEED_EVAL_NONE:
12787d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12797d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12807d8d0e25Snbeams         code << "      }\n";
12817d8d0e25Snbeams         break; // No action
12827d8d0e25Snbeams       case CEED_EVAL_INTERP:
12837d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12847d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12857d8d0e25Snbeams         code << "      }\n";
12867d8d0e25Snbeams         break;
12877d8d0e25Snbeams       case CEED_EVAL_GRAD:
12887d8d0e25Snbeams         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
12897d8d0e25Snbeams         break;
12907d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12917d8d0e25Snbeams         break; // Should not occur
12927d8d0e25Snbeams       case CEED_EVAL_DIV:
12937d8d0e25Snbeams         break; // TODO: Not implemented
12947d8d0e25Snbeams       case CEED_EVAL_CURL:
12957d8d0e25Snbeams         break; // TODO: Not implemented
12967d8d0e25Snbeams       }
12977d8d0e25Snbeams     }
12987d8d0e25Snbeams     code << "    }\n";
12997d8d0e25Snbeams   }
13007d8d0e25Snbeams 
13017d8d0e25Snbeams   // Output basis apply if needed
13027d8d0e25Snbeams   // Generate the correct eval mode code for each output
13037d8d0e25Snbeams   code << "\n    // -- Output field basis action and restrictions --\n";
13047d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
13057d8d0e25Snbeams     code << "    // ---- Output field "<<i<<" ----\n";
13067d8d0e25Snbeams     // Get elemsize, emode, ncomp
13077d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1308e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13097d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1310e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13117d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1312e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13137d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1314e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13157d8d0e25Snbeams     // Basis action
13167d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
13177d8d0e25Snbeams     switch (emode) {
13187d8d0e25Snbeams     case CEED_EVAL_NONE:
13197d8d0e25Snbeams       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
13207d8d0e25Snbeams       break; // No action
13217d8d0e25Snbeams     case CEED_EVAL_INTERP:
13227d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13237d8d0e25Snbeams       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13247d8d0e25Snbeams       break;
13257d8d0e25Snbeams     case CEED_EVAL_GRAD:
13267d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13277d8d0e25Snbeams       if (useCollograd) {
13287d8d0e25Snbeams         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13297d8d0e25Snbeams       } else {
13307d8d0e25Snbeams         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
13317d8d0e25Snbeams       }
13327d8d0e25Snbeams       break;
13337d8d0e25Snbeams     // LCOV_EXCL_START
13347d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
13357d8d0e25Snbeams       Ceed ceed;
1336e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1337e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
13387d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
13397d8d0e25Snbeams       break; // Should not occur
13407d8d0e25Snbeams     }
13417d8d0e25Snbeams     case CEED_EVAL_DIV:
13427d8d0e25Snbeams       break; // TODO: Not implemented
13437d8d0e25Snbeams     case CEED_EVAL_CURL:
13447d8d0e25Snbeams       break; // TODO: Not implemented
13457d8d0e25Snbeams       // LCOV_EXCL_STOP
13467d8d0e25Snbeams     }
13477d8d0e25Snbeams     // Restriction
13487d8d0e25Snbeams       bool isStrided;
1349e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13507d8d0e25Snbeams     if (!isStrided) {
13517d8d0e25Snbeams       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1352e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13537d8d0e25Snbeams       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13547d8d0e25Snbeams       CeedInt compstride;
1355e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13567d8d0e25Snbeams       code << "    // CompStride: "<<compstride<<"\n";
1357e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13587d8d0e25Snbeams       data->indices.out[i] = restr_data->d_ind;
13597d8d0e25Snbeams       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
13607d8d0e25Snbeams     } else {
13617d8d0e25Snbeams       bool backendstrides;
13627d8d0e25Snbeams       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1363e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13647d8d0e25Snbeams       CeedInt nelem;
13657d8d0e25Snbeams       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1366e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13677d8d0e25Snbeams       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
13687d8d0e25Snbeams       if (!backendstrides) {
13697d8d0e25Snbeams         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1370e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
13717d8d0e25Snbeams       }
13727d8d0e25Snbeams       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
13737d8d0e25Snbeams       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
13747d8d0e25Snbeams     }
13757d8d0e25Snbeams   }
13767d8d0e25Snbeams 
13777d8d0e25Snbeams   code << "  }\n";
13787d8d0e25Snbeams   code << "}\n";
13797d8d0e25Snbeams   code << "// -----------------------------------------------------------------------------\n\n";
13807d8d0e25Snbeams 
13817d8d0e25Snbeams   // View kernel for debugging
138246dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13833f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
13847d8d0e25Snbeams 
1385b3e1519bSnbeams   CeedInt block_sizes[3];
138637c3b1cfSnbeams   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1387b3e1519bSnbeams   CeedChkBackend(ierr);
1388b3e1519bSnbeams   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1389b3e1519bSnbeams                          "T1d", block_sizes[0],
1390b3e1519bSnbeams 			 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1391e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
13927d8d0e25Snbeams   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1393e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
13947d8d0e25Snbeams 
1395e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1396e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
13977d8d0e25Snbeams }
13987d8d0e25Snbeams //------------------------------------------------------------------------------
1399