xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision b3e1519b234fedc04be9e4332adea84b1c794820)
17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details.
47d8d0e25Snbeams //
57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software
67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral
77d8d0e25Snbeams // element discretizations for exascale applications. For more information and
87d8d0e25Snbeams // source code availability see http://github.com/ceed.
97d8d0e25Snbeams //
107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office
127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for
137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including
147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early
157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
177d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
187d8d0e25Snbeams 
19ec3da8bcSJed Brown #include <ceed/ceed.h>
20ec3da8bcSJed Brown #include <ceed/backend.h>
217d8d0e25Snbeams #include <iostream>
223d576824SJeremy L Thompson #include <string>
237d8d0e25Snbeams #include <sstream>
243d576824SJeremy L Thompson #include "ceed-hip-gen.h"
250d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
267d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
277d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
287d8d0e25Snbeams 
29*b3e1519bSnbeams 
30*b3e1519bSnbeams //------------------------------------------------------------------------------
31*b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
32*b3e1519bSnbeams //------------------------------------------------------------------------------
33*b3e1519bSnbeams CEED_INTERN int BlockGridCalculate(const CeedInt dim, const CeedInt nelem,
34*b3e1519bSnbeams 		                   const CeedInt P1d, const CeedInt Q1d,
35*b3e1519bSnbeams 				   CeedInt *block_sizes) {
36*b3e1519bSnbeams 
37*b3e1519bSnbeams   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
38*b3e1519bSnbeams   if (dim==1) {
39*b3e1519bSnbeams     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
40*b3e1519bSnbeams     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
41*b3e1519bSnbeams     block_sizes[0] = thread1d;
42*b3e1519bSnbeams     block_sizes[1] = 1;
43*b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
44*b3e1519bSnbeams   } else if (dim==2) {
45*b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
46*b3e1519bSnbeams     block_sizes[0] = thread1d;
47*b3e1519bSnbeams     block_sizes[1] = thread1d;
48*b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
49*b3e1519bSnbeams   } else if (dim==3) {
50*b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
51*b3e1519bSnbeams     block_sizes[0] = thread1d;
52*b3e1519bSnbeams     block_sizes[1] = thread1d;
53*b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
54*b3e1519bSnbeams   }
55*b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
56*b3e1519bSnbeams }
57*b3e1519bSnbeams 
587d8d0e25Snbeams static const char *deviceFunctions = QUOTE(
597d8d0e25Snbeams 
607d8d0e25Snbeams //------------------------------------------------------------------------------
617d8d0e25Snbeams // Typedefs
627d8d0e25Snbeams //------------------------------------------------------------------------------
637d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
647d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
657d8d0e25Snbeams 
667d8d0e25Snbeams typedef struct {
677d8d0e25Snbeams   CeedInt tidx;
687d8d0e25Snbeams   CeedInt tidy;
697d8d0e25Snbeams   CeedInt tidz;
707d8d0e25Snbeams   CeedInt tid;
717d8d0e25Snbeams   CeedScalar* slice;
727d8d0e25Snbeams } BackendData;
737d8d0e25Snbeams 
747d8d0e25Snbeams //------------------------------------------------------------------------------
757d8d0e25Snbeams // Load matrices for basis actions
767d8d0e25Snbeams //------------------------------------------------------------------------------
777d8d0e25Snbeams template <int P, int Q>
787d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
797d8d0e25Snbeams   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
807d8d0e25Snbeams     B[i] = d_B[i];
817d8d0e25Snbeams }
827d8d0e25Snbeams 
837d8d0e25Snbeams //------------------------------------------------------------------------------
847d8d0e25Snbeams // 1D
857d8d0e25Snbeams //------------------------------------------------------------------------------
867d8d0e25Snbeams 
877d8d0e25Snbeams //------------------------------------------------------------------------------
887d8d0e25Snbeams // L-vector -> E-vector, offsets provided
897d8d0e25Snbeams //------------------------------------------------------------------------------
907d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
917d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
927d8d0e25Snbeams   if (data.tidx < P1d) {
937d8d0e25Snbeams     const CeedInt node = data.tidx;
947d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
957d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
967d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
977d8d0e25Snbeams   }
987d8d0e25Snbeams }
997d8d0e25Snbeams 
1007d8d0e25Snbeams //------------------------------------------------------------------------------
1017d8d0e25Snbeams // L-vector -> E-vector, strided
1027d8d0e25Snbeams //------------------------------------------------------------------------------
1037d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1047d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
1057d8d0e25Snbeams   if (data.tidx < P1d) {
1067d8d0e25Snbeams     const CeedInt node = data.tidx;
1077d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1087d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1097d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1107d8d0e25Snbeams   }
1117d8d0e25Snbeams }
1127d8d0e25Snbeams 
1137d8d0e25Snbeams //------------------------------------------------------------------------------
1147d8d0e25Snbeams // E-vector -> L-vector, offsets provided
1157d8d0e25Snbeams //------------------------------------------------------------------------------
1167d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
1177d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
1187d8d0e25Snbeams   if (data.tidx < P1d) {
1197d8d0e25Snbeams     const CeedInt node = data.tidx;
1207d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
1217d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1227d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
1237d8d0e25Snbeams   }
1247d8d0e25Snbeams }
1257d8d0e25Snbeams 
1267d8d0e25Snbeams //------------------------------------------------------------------------------
1277d8d0e25Snbeams // E-vector -> L-vector, strided
1287d8d0e25Snbeams //------------------------------------------------------------------------------
1297d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1307d8d0e25Snbeams inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1317d8d0e25Snbeams   if (data.tidx < P1d) {
1327d8d0e25Snbeams     const CeedInt node = data.tidx;
1337d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1347d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1357d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1367d8d0e25Snbeams   }
1377d8d0e25Snbeams }
1387d8d0e25Snbeams 
1397d8d0e25Snbeams //------------------------------------------------------------------------------
1407d8d0e25Snbeams // 1D tensor contraction x
1417d8d0e25Snbeams //------------------------------------------------------------------------------
1427d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1437d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1447d8d0e25Snbeams   data.slice[data.tidx] = *U;
1457d8d0e25Snbeams   __syncthreads();
1467d8d0e25Snbeams   *V = 0.0;
1477d8d0e25Snbeams   if (data.tidx < Q1d)
1487d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
1497d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
1507d8d0e25Snbeams   __syncthreads();
1517d8d0e25Snbeams }
1527d8d0e25Snbeams 
1537d8d0e25Snbeams //------------------------------------------------------------------------------
1547d8d0e25Snbeams // 1D transpose tensor contraction x
1557d8d0e25Snbeams //------------------------------------------------------------------------------
1567d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1577d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1587d8d0e25Snbeams   data.slice[data.tidx] = *U;
1597d8d0e25Snbeams   __syncthreads();
1607d8d0e25Snbeams   *V = 0.0;
1617d8d0e25Snbeams   if (data.tidx < P1d)
1627d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
1637d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
1647d8d0e25Snbeams   __syncthreads();
1657d8d0e25Snbeams }
1667d8d0e25Snbeams 
1677d8d0e25Snbeams //------------------------------------------------------------------------------
1687d8d0e25Snbeams // 1D interpolate to quadrature points
1697d8d0e25Snbeams //------------------------------------------------------------------------------
1707d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1717d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1727d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1737d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1747d8d0e25Snbeams }
1757d8d0e25Snbeams 
1767d8d0e25Snbeams //------------------------------------------------------------------------------
1777d8d0e25Snbeams // 1D interpolate transpose
1787d8d0e25Snbeams //------------------------------------------------------------------------------
1797d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1807d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1817d8d0e25Snbeams   for (CeedInt comp=0; comp<NCOMP; comp++)
1827d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1837d8d0e25Snbeams }
1847d8d0e25Snbeams 
1857d8d0e25Snbeams //------------------------------------------------------------------------------
1867d8d0e25Snbeams // 1D derivatives at quadrature points
1877d8d0e25Snbeams //------------------------------------------------------------------------------
1887d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1897d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1907d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1917d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1927d8d0e25Snbeams }
1937d8d0e25Snbeams 
1947d8d0e25Snbeams //------------------------------------------------------------------------------
1957d8d0e25Snbeams // 1D derivatives transpose
1967d8d0e25Snbeams //------------------------------------------------------------------------------
1977d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1987d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1997d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
2007d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
2017d8d0e25Snbeams }
2027d8d0e25Snbeams 
2037d8d0e25Snbeams //------------------------------------------------------------------------------
2047d8d0e25Snbeams // 2D
2057d8d0e25Snbeams //------------------------------------------------------------------------------
2067d8d0e25Snbeams 
2077d8d0e25Snbeams //------------------------------------------------------------------------------
2087d8d0e25Snbeams // L-vector -> E-vector, offsets provided
2097d8d0e25Snbeams //------------------------------------------------------------------------------
2107d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2117d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
2127d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2137d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2147d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2157d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2167d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
2177d8d0e25Snbeams   }
2187d8d0e25Snbeams }
2197d8d0e25Snbeams 
2207d8d0e25Snbeams //------------------------------------------------------------------------------
2217d8d0e25Snbeams // L-vector -> E-vector, strided
2227d8d0e25Snbeams //------------------------------------------------------------------------------
2237d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2247d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2257d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2267d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2277d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2287d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2297d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2307d8d0e25Snbeams   }
2317d8d0e25Snbeams }
2327d8d0e25Snbeams 
2337d8d0e25Snbeams //------------------------------------------------------------------------------
2347d8d0e25Snbeams // E-vector -> L-vector, offsets provided
2357d8d0e25Snbeams //------------------------------------------------------------------------------
2367d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2377d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
2387d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2397d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2407d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2417d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2427d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
2437d8d0e25Snbeams   }
2447d8d0e25Snbeams }
2457d8d0e25Snbeams 
2467d8d0e25Snbeams //------------------------------------------------------------------------------
2477d8d0e25Snbeams // E-vector -> L-vector, strided
2487d8d0e25Snbeams //------------------------------------------------------------------------------
2497d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2507d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2517d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2527d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2537d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2547d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2557d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2567d8d0e25Snbeams   }
2577d8d0e25Snbeams }
2587d8d0e25Snbeams 
2597d8d0e25Snbeams //------------------------------------------------------------------------------
2607d8d0e25Snbeams // 2D tensor contraction x
2617d8d0e25Snbeams //------------------------------------------------------------------------------
2627d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2637d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2647d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2657d8d0e25Snbeams   __syncthreads();
2667d8d0e25Snbeams   *V = 0.0;
2677d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2687d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2697d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2707d8d0e25Snbeams   __syncthreads();
2717d8d0e25Snbeams }
2727d8d0e25Snbeams 
2737d8d0e25Snbeams //------------------------------------------------------------------------------
2747d8d0e25Snbeams // 2D tensor contract y
2757d8d0e25Snbeams //------------------------------------------------------------------------------
2767d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2777d8d0e25Snbeams inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2787d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2797d8d0e25Snbeams   __syncthreads();
2807d8d0e25Snbeams   *V = 0.0;
2817d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d)
2827d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2837d8d0e25Snbeams       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2847d8d0e25Snbeams   __syncthreads();
2857d8d0e25Snbeams }
2867d8d0e25Snbeams 
2877d8d0e25Snbeams //------------------------------------------------------------------------------
2887d8d0e25Snbeams // 2D transpose tensor contract y
2897d8d0e25Snbeams //------------------------------------------------------------------------------
2907d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2917d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2927d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2937d8d0e25Snbeams   __syncthreads();
2947d8d0e25Snbeams   *V = 0.0;
2957d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2967d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2977d8d0e25Snbeams       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2987d8d0e25Snbeams   __syncthreads();
2997d8d0e25Snbeams }
3007d8d0e25Snbeams 
3017d8d0e25Snbeams //------------------------------------------------------------------------------
3027d8d0e25Snbeams // 2D transpose tensor contract x
3037d8d0e25Snbeams //------------------------------------------------------------------------------
3047d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3057d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3067d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
3077d8d0e25Snbeams   __syncthreads();
3087d8d0e25Snbeams   *V = 0.0;
3097d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3107d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3117d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3127d8d0e25Snbeams   __syncthreads();
3137d8d0e25Snbeams }
3147d8d0e25Snbeams 
3157d8d0e25Snbeams //------------------------------------------------------------------------------
3167d8d0e25Snbeams // 2D transpose tensor contract and add x
3177d8d0e25Snbeams //------------------------------------------------------------------------------
3187d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3197d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3207d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
3217d8d0e25Snbeams   __syncthreads();
3227d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3237d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3247d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3257d8d0e25Snbeams   __syncthreads();
3267d8d0e25Snbeams }
3277d8d0e25Snbeams 
3287d8d0e25Snbeams //------------------------------------------------------------------------------
3297d8d0e25Snbeams // 2D interpolate to quadrature points
3307d8d0e25Snbeams //------------------------------------------------------------------------------
3317d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3327d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3337d8d0e25Snbeams   CeedScalar r_t[1];
3347d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3357d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3367d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3377d8d0e25Snbeams   }
3387d8d0e25Snbeams }
3397d8d0e25Snbeams 
3407d8d0e25Snbeams //------------------------------------------------------------------------------
3417d8d0e25Snbeams // 2D interpolate transpose
3427d8d0e25Snbeams //------------------------------------------------------------------------------
3437d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3447d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3457d8d0e25Snbeams   CeedScalar r_t[1];
3467d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3477d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3487d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3497d8d0e25Snbeams   }
3507d8d0e25Snbeams }
3517d8d0e25Snbeams 
3527d8d0e25Snbeams //------------------------------------------------------------------------------
3537d8d0e25Snbeams // 2D derivatives at quadrature points
3547d8d0e25Snbeams //------------------------------------------------------------------------------
3557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3567d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3577d8d0e25Snbeams   CeedScalar r_t[1];
3587d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3597d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
3607d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
3617d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3627d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
3637d8d0e25Snbeams   }
3647d8d0e25Snbeams }
3657d8d0e25Snbeams 
3667d8d0e25Snbeams //------------------------------------------------------------------------------
3677d8d0e25Snbeams // 2D derivatives transpose
3687d8d0e25Snbeams //------------------------------------------------------------------------------
3697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3707d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3717d8d0e25Snbeams   CeedScalar r_t[1];
3727d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3737d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
3747d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
3757d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
3767d8d0e25Snbeams     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3777d8d0e25Snbeams   }
3787d8d0e25Snbeams }
3797d8d0e25Snbeams 
3807d8d0e25Snbeams //------------------------------------------------------------------------------
3817d8d0e25Snbeams // 3D
3827d8d0e25Snbeams //------------------------------------------------------------------------------
3837d8d0e25Snbeams 
3847d8d0e25Snbeams //------------------------------------------------------------------------------
3857d8d0e25Snbeams // L-vector -> E-vector, offsets provided
3867d8d0e25Snbeams //------------------------------------------------------------------------------
3877d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
3887d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
3897d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3907d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3917d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3927d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
3937d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3947d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
3957d8d0e25Snbeams     }
3967d8d0e25Snbeams }
3977d8d0e25Snbeams 
3987d8d0e25Snbeams //------------------------------------------------------------------------------
3997d8d0e25Snbeams // L-vector -> E-vector, strided
4007d8d0e25Snbeams //------------------------------------------------------------------------------
4017d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4027d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
4037d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4047d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4057d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4067d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4077d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4087d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
4097d8d0e25Snbeams     }
4107d8d0e25Snbeams }
4117d8d0e25Snbeams 
4127d8d0e25Snbeams //------------------------------------------------------------------------------
4137d8d0e25Snbeams // E-vector -> Q-vector, offests provided
4147d8d0e25Snbeams //------------------------------------------------------------------------------
4157d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d>
4167d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
4177d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4187d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4197d8d0e25Snbeams     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
4207d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4217d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
4227d8d0e25Snbeams   }
4237d8d0e25Snbeams }
4247d8d0e25Snbeams 
4257d8d0e25Snbeams //------------------------------------------------------------------------------
4267d8d0e25Snbeams // E-vector -> Q-vector, strided
4277d8d0e25Snbeams //------------------------------------------------------------------------------
4287d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4297d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
4307d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4317d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4327d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4337d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4347d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
4357d8d0e25Snbeams   }
4367d8d0e25Snbeams }
4377d8d0e25Snbeams 
4387d8d0e25Snbeams //------------------------------------------------------------------------------
4397d8d0e25Snbeams // E-vector -> L-vector, offsets provided
4407d8d0e25Snbeams //------------------------------------------------------------------------------
4417d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
4427d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
4437d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4447d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4457d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4467d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
4477d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4487d8d0e25Snbeams         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
4497d8d0e25Snbeams     }
4507d8d0e25Snbeams }
4517d8d0e25Snbeams 
4527d8d0e25Snbeams //------------------------------------------------------------------------------
4537d8d0e25Snbeams // E-vector -> L-vector, strided
4547d8d0e25Snbeams //------------------------------------------------------------------------------
4557d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4567d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
4577d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4587d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4597d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4607d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4617d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4627d8d0e25Snbeams         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
4637d8d0e25Snbeams     }
4647d8d0e25Snbeams }
4657d8d0e25Snbeams 
4667d8d0e25Snbeams //------------------------------------------------------------------------------
4677d8d0e25Snbeams // 3D tensor contract x
4687d8d0e25Snbeams //------------------------------------------------------------------------------
4697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4707d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4717d8d0e25Snbeams   CeedScalar r_B[P1d];
4727d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4737d8d0e25Snbeams     r_B[i] = B[i + data.tidx*P1d];
4747d8d0e25Snbeams 
4757d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4767d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4777d8d0e25Snbeams     __syncthreads();
4787d8d0e25Snbeams     V[k] = 0.0;
4797d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
4807d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4817d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
4827d8d0e25Snbeams     __syncthreads();
4837d8d0e25Snbeams   }
4847d8d0e25Snbeams }
4857d8d0e25Snbeams 
4867d8d0e25Snbeams //------------------------------------------------------------------------------
4877d8d0e25Snbeams // 3D tensor contract y
4887d8d0e25Snbeams //------------------------------------------------------------------------------
4897d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4907d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4917d8d0e25Snbeams   CeedScalar r_B[P1d];
4927d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4937d8d0e25Snbeams     r_B[i] = B[i + data.tidy*P1d];
4947d8d0e25Snbeams 
4957d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4967d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4977d8d0e25Snbeams     __syncthreads();
4987d8d0e25Snbeams     V[k] = 0.0;
4997d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5007d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
5017d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5027d8d0e25Snbeams     __syncthreads();
5037d8d0e25Snbeams   }
5047d8d0e25Snbeams }
5057d8d0e25Snbeams 
5067d8d0e25Snbeams //------------------------------------------------------------------------------
5077d8d0e25Snbeams // 3D tensor contract z
5087d8d0e25Snbeams //------------------------------------------------------------------------------
5097d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5107d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5117d8d0e25Snbeams   for (CeedInt k = 0; k < Q1d; ++k) {
5127d8d0e25Snbeams     V[k] = 0.0;
5137d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5147d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
5157d8d0e25Snbeams         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
5167d8d0e25Snbeams   }
5177d8d0e25Snbeams }
5187d8d0e25Snbeams 
5197d8d0e25Snbeams //------------------------------------------------------------------------------
5207d8d0e25Snbeams // 3D transpose tensor contract z
5217d8d0e25Snbeams //------------------------------------------------------------------------------
5227d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5237d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5247d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5257d8d0e25Snbeams     V[k] = 0.0;
5267d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5277d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5287d8d0e25Snbeams         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
5297d8d0e25Snbeams   }
5307d8d0e25Snbeams }
5317d8d0e25Snbeams 
5327d8d0e25Snbeams //------------------------------------------------------------------------------
5337d8d0e25Snbeams // 3D transpose tensor contract y
5347d8d0e25Snbeams //------------------------------------------------------------------------------
5357d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5367d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5377d8d0e25Snbeams   CeedScalar r_B[Q1d];
5387d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5397d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5407d8d0e25Snbeams 
5417d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5427d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5437d8d0e25Snbeams     __syncthreads();
5447d8d0e25Snbeams     V[k] = 0.0;
5457d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5467d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5477d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5487d8d0e25Snbeams     __syncthreads();
5497d8d0e25Snbeams   }
5507d8d0e25Snbeams }
5517d8d0e25Snbeams 
5527d8d0e25Snbeams //------------------------------------------------------------------------------
5537d8d0e25Snbeams // 3D transpose tensor contract add y
5547d8d0e25Snbeams //------------------------------------------------------------------------------
5557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5567d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5577d8d0e25Snbeams   CeedScalar r_B[Q1d];
5587d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5597d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5607d8d0e25Snbeams 
5617d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5627d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5637d8d0e25Snbeams     __syncthreads();
5647d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5657d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5667d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5677d8d0e25Snbeams     __syncthreads();
5687d8d0e25Snbeams   }
5697d8d0e25Snbeams }
5707d8d0e25Snbeams 
5717d8d0e25Snbeams //------------------------------------------------------------------------------
5727d8d0e25Snbeams // 3D transpose tensor contract x
5737d8d0e25Snbeams //------------------------------------------------------------------------------
5747d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5757d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5767d8d0e25Snbeams   CeedScalar r_B[Q1d];
5777d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5787d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5797d8d0e25Snbeams 
5807d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5817d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5827d8d0e25Snbeams     __syncthreads();
5837d8d0e25Snbeams     V[k] = 0.0;
5847d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5857d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5867d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5877d8d0e25Snbeams     __syncthreads();
5887d8d0e25Snbeams   }
5897d8d0e25Snbeams }
5907d8d0e25Snbeams 
5917d8d0e25Snbeams //------------------------------------------------------------------------------
5927d8d0e25Snbeams // 3D transpose tensor contract add x
5937d8d0e25Snbeams //------------------------------------------------------------------------------
5947d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5957d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5967d8d0e25Snbeams   CeedScalar r_B[Q1d];
5977d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5987d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5997d8d0e25Snbeams 
6007d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
6017d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
6027d8d0e25Snbeams     __syncthreads();
6037d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
6047d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6057d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
6067d8d0e25Snbeams     __syncthreads();
6077d8d0e25Snbeams   }
6087d8d0e25Snbeams }
6097d8d0e25Snbeams 
6107d8d0e25Snbeams //------------------------------------------------------------------------------
6117d8d0e25Snbeams // 3D interpolate to quadrature points
6127d8d0e25Snbeams //------------------------------------------------------------------------------
6137d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6147d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6157d8d0e25Snbeams   CeedScalar r_t1[T1d];
6167d8d0e25Snbeams   CeedScalar r_t2[T1d];
6177d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6187d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6197d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6207d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
6217d8d0e25Snbeams   }
6227d8d0e25Snbeams }
6237d8d0e25Snbeams 
6247d8d0e25Snbeams //------------------------------------------------------------------------------
6257d8d0e25Snbeams // 3D interpolate transpose
6267d8d0e25Snbeams //------------------------------------------------------------------------------
6277d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6287d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6297d8d0e25Snbeams   CeedScalar r_t1[T1d];
6307d8d0e25Snbeams   CeedScalar r_t2[T1d];
6317d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6327d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
6337d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6347d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6357d8d0e25Snbeams   }
6367d8d0e25Snbeams }
6377d8d0e25Snbeams 
6387d8d0e25Snbeams //------------------------------------------------------------------------------
6397d8d0e25Snbeams // 3D derivatives at quadrature points
6407d8d0e25Snbeams //------------------------------------------------------------------------------
6417d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6427d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6437d8d0e25Snbeams   CeedScalar r_t1[T1d];
6447d8d0e25Snbeams   CeedScalar r_t2[T1d];
6457d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6467d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
6477d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6487d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
6497d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6507d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6517d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
6527d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6537d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6547d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
6557d8d0e25Snbeams   }
6567d8d0e25Snbeams }
6577d8d0e25Snbeams 
6587d8d0e25Snbeams //------------------------------------------------------------------------------
6597d8d0e25Snbeams // 3D derivatives transpose
6607d8d0e25Snbeams //------------------------------------------------------------------------------
6617d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6627d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6637d8d0e25Snbeams   CeedScalar r_t1[T1d];
6647d8d0e25Snbeams   CeedScalar r_t2[T1d];
6657d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6667d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
6677d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6687d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
6697d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
6707d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6717d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6727d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
6737d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6747d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6757d8d0e25Snbeams   }
6767d8d0e25Snbeams }
6777d8d0e25Snbeams 
6787d8d0e25Snbeams //------------------------------------------------------------------------------
6797d8d0e25Snbeams // 3D collocated derivatives computation
6807d8d0e25Snbeams //------------------------------------------------------------------------------
6817d8d0e25Snbeams template <int NCOMP, int Q1d>
6827d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6837d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
6847d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
6857d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
6867d8d0e25Snbeams       __syncthreads();
6877d8d0e25Snbeams       // X derivative
6887d8d0e25Snbeams       r_V[comp+0*NCOMP] = 0.0;
6897d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6907d8d0e25Snbeams         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
6917d8d0e25Snbeams       // Y derivative
6927d8d0e25Snbeams       r_V[comp+1*NCOMP] = 0.0;
6937d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6947d8d0e25Snbeams         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
6957d8d0e25Snbeams       // Z derivative
6967d8d0e25Snbeams       r_V[comp+2*NCOMP] = 0.0;
6977d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6987d8d0e25Snbeams         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
6997d8d0e25Snbeams       __syncthreads();
7007d8d0e25Snbeams     }
7017d8d0e25Snbeams   }
7027d8d0e25Snbeams }
7037d8d0e25Snbeams 
7047d8d0e25Snbeams //------------------------------------------------------------------------------
7057d8d0e25Snbeams // 3D collocated derivatives transpose
7067d8d0e25Snbeams //------------------------------------------------------------------------------
7077d8d0e25Snbeams template <int NCOMP, int Q1d>
7087d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
7097d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
7107d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
7117d8d0e25Snbeams       // X derivative
7127d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
7137d8d0e25Snbeams       __syncthreads();
7147d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7157d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
7167d8d0e25Snbeams       __syncthreads();
7177d8d0e25Snbeams       // Y derivative
7187d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
7197d8d0e25Snbeams       __syncthreads();
7207d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7217d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
7227d8d0e25Snbeams       __syncthreads();
7237d8d0e25Snbeams       // Z derivative
7247d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7257d8d0e25Snbeams         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
7267d8d0e25Snbeams     }
7277d8d0e25Snbeams   }
7287d8d0e25Snbeams }
7297d8d0e25Snbeams 
7307d8d0e25Snbeams //------------------------------------------------------------------------------
7317d8d0e25Snbeams // 1D quadrature weights
7327d8d0e25Snbeams //------------------------------------------------------------------------------
7337d8d0e25Snbeams template <int Q1d>
7347d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7357d8d0e25Snbeams   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
7367d8d0e25Snbeams }
7377d8d0e25Snbeams 
7387d8d0e25Snbeams //------------------------------------------------------------------------------
7397d8d0e25Snbeams // 2D quadrature weights
7407d8d0e25Snbeams //------------------------------------------------------------------------------
7417d8d0e25Snbeams template <int Q1d>
7427d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7437d8d0e25Snbeams   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
7447d8d0e25Snbeams         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7457d8d0e25Snbeams }
7467d8d0e25Snbeams 
7477d8d0e25Snbeams //------------------------------------------------------------------------------
7487d8d0e25Snbeams // 3D quadrature weights
7497d8d0e25Snbeams //------------------------------------------------------------------------------
7507d8d0e25Snbeams template <int Q1d>
7517d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7527d8d0e25Snbeams   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
7537d8d0e25Snbeams   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7547d8d0e25Snbeams   for (CeedInt z = 0; z < Q1d; ++z)
7557d8d0e25Snbeams     w[z] = quad ? pw*qweight1d[z] : 0.0;
7567d8d0e25Snbeams }
7577d8d0e25Snbeams 
7587d8d0e25Snbeams );
7597d8d0e25Snbeams //------------------------------------------------------------------------------
7607d8d0e25Snbeams // Build singe operator kernel
7617d8d0e25Snbeams //------------------------------------------------------------------------------
762680aa83fSnbeams CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {
7637d8d0e25Snbeams 
7647d8d0e25Snbeams   using std::ostringstream;
7657d8d0e25Snbeams   using std::string;
7667d8d0e25Snbeams   int ierr;
7677d8d0e25Snbeams   bool setupdone;
768e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
769e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
7707d8d0e25Snbeams   Ceed ceed;
771e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
7727d8d0e25Snbeams   CeedOperator_Hip_gen *data;
773e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
7747d8d0e25Snbeams   CeedQFunction qf;
7757d8d0e25Snbeams   CeedQFunction_Hip_gen *qf_data;
776e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
777e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
77878464608Sjeremylt   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
7797d8d0e25Snbeams           numoutputfields, ncomp, dim = 0, lsize;
780e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
781e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
7827d8d0e25Snbeams   CeedOperatorField *opinputfields, *opoutputfields;
7837e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
784e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7857d8d0e25Snbeams   CeedQFunctionField *qfinputfields, *qfoutputfields;
7867e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
787e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7887d8d0e25Snbeams   CeedEvalMode emode;
7897d8d0e25Snbeams   CeedBasis basis;
7907d8d0e25Snbeams   CeedBasis_Hip_shared *basis_data;
7917d8d0e25Snbeams   CeedElemRestriction Erestrict;
7927d8d0e25Snbeams   CeedElemRestriction_Hip *restr_data;
7937d8d0e25Snbeams 
7940b454692Sjeremylt   // Check for restriction only identity operator
7950b454692Sjeremylt   bool is_identity_qf;
7960b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7970b454692Sjeremylt   if (is_identity_qf) {
7980b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7990b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
8000b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
8010b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
8020b454692Sjeremylt       // LCOV_EXCL_START
8030b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
8040b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
8050b454692Sjeremylt     // LCOV_EXCL_STOP
8060b454692Sjeremylt   }
8070b454692Sjeremylt 
8087d8d0e25Snbeams   ostringstream code;
8097d8d0e25Snbeams   string devFunctions(deviceFunctions);
8107d8d0e25Snbeams   code << devFunctions;
8117d8d0e25Snbeams 
8127d8d0e25Snbeams   string qFunction(qf_data->qFunctionSource);
8137d8d0e25Snbeams   string qFunctionName(qf_data->qFunctionName);
8147d8d0e25Snbeams   string oper;
8157d8d0e25Snbeams   oper = "CeedKernel_Hip_gen_" + qFunctionName;
8167d8d0e25Snbeams 
8177d8d0e25Snbeams   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81877841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
819e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
820e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8217d8d0e25Snbeams 
8227d8d0e25Snbeams   // Find dim and Q1d
8237d8d0e25Snbeams   bool useCollograd = true;
8247d8d0e25Snbeams   data->maxP1d = 0;
8257d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
826e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8277d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
828e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8297d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
830e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8317d8d0e25Snbeams 
8327d8d0e25Snbeams       // Check for collocated gradient
833437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8347d8d0e25Snbeams 
8357d8d0e25Snbeams       // Collect dim and Q1d
836e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8377d8d0e25Snbeams       bool isTensor;
838e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8397d8d0e25Snbeams       if (isTensor) {
840e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
841e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
8427d8d0e25Snbeams         if (P1d>data->maxP1d) data->maxP1d = P1d;
8437d8d0e25Snbeams       } else {
8447d8d0e25Snbeams         // LCOV_EXCL_START
845e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8467d8d0e25Snbeams         // LCOV_EXCL_STOP
8477d8d0e25Snbeams         }
8487d8d0e25Snbeams     }
8497d8d0e25Snbeams   }
8507d8d0e25Snbeams   // Check output bases for Q1d, dim as well
8517d8d0e25Snbeams   //   The only imput basis might be CEED_BASIS_COLLOCATED
8527d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
853e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8547d8d0e25Snbeams 
8557d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
856e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8577d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
858e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8597d8d0e25Snbeams 
8607d8d0e25Snbeams       // Collect dim and Q1d
861e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8627d8d0e25Snbeams       bool isTensor;
863e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8647d8d0e25Snbeams       if (isTensor) {
865e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8667d8d0e25Snbeams       } else {
8677d8d0e25Snbeams         // LCOV_EXCL_START
868e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8697d8d0e25Snbeams         // LCOV_EXCL_STOP
8707d8d0e25Snbeams         }
8717d8d0e25Snbeams 
8727d8d0e25Snbeams       // Check for collocated gradient
873437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8747d8d0e25Snbeams     }
8757d8d0e25Snbeams   }
8767d8d0e25Snbeams   data->dim = dim;
8777d8d0e25Snbeams   data->Q1d = Q1d;
8787d8d0e25Snbeams 
8797d8d0e25Snbeams   // Define CEED_Q_VLA
8807d8d0e25Snbeams   if (dim != 3 || useCollograd) {
8817d8d0e25Snbeams     code << "\n#define CEED_Q_VLA 1\n\n";
8827d8d0e25Snbeams   } else {
8837d8d0e25Snbeams     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8847d8d0e25Snbeams   }
8857d8d0e25Snbeams 
8867d8d0e25Snbeams   code << qFunction;
8877d8d0e25Snbeams 
8887d8d0e25Snbeams   // Setup
8897d8d0e25Snbeams   code << "\n// -----------------------------------------------------------------------------\n";
890*b3e1519bSnbeams   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
891*b3e1519bSnbeams   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
8927d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
8937d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
894e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8957d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
8967d8d0e25Snbeams       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
8977d8d0e25Snbeams     }
8987d8d0e25Snbeams   }
8997d8d0e25Snbeams 
9007d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9017d8d0e25Snbeams     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
9027d8d0e25Snbeams   }
9037d8d0e25Snbeams 
9047d8d0e25Snbeams   code << "  const CeedInt Dim = "<<dim<<";\n";
9057d8d0e25Snbeams   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
9067d8d0e25Snbeams 
9077d8d0e25Snbeams   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
9087d8d0e25Snbeams   code << "  BackendData data;\n";
9097d8d0e25Snbeams   code << "  data.tidx = threadIdx.x;\n";
9107d8d0e25Snbeams   code << "  data.tidy = threadIdx.y;\n";
9117d8d0e25Snbeams   code << "  data.tidz = threadIdx.z;\n";
9127d8d0e25Snbeams   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
9137d8d0e25Snbeams   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
9147d8d0e25Snbeams 
9157d8d0e25Snbeams   code << "\n  // -- Input field constants and basis data --\n";
9167d8d0e25Snbeams   //Initialize constants, and matrices B and G
9177d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
9187d8d0e25Snbeams     code << "  // ---- Input field "<<i<<" ----\n";
9197d8d0e25Snbeams     // Get elemsize, emode, ncomp
9207d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
921e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9227d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
923e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9247d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
925e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9267d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
927e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9287d8d0e25Snbeams 
9297d8d0e25Snbeams     // Set field constants
9307d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) {
931e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
9327d8d0e25Snbeams       if (basis != CEED_BASIS_COLLOCATED) {
933e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9347d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
9357d8d0e25Snbeams       } else {
9367d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
9377d8d0e25Snbeams       }
9387d8d0e25Snbeams       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9397d8d0e25Snbeams     }
9407d8d0e25Snbeams 
9417d8d0e25Snbeams     // Load basis data
9427d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9437d8d0e25Snbeams     switch (emode) {
9447d8d0e25Snbeams     case CEED_EVAL_NONE:
9457d8d0e25Snbeams       break;
9467d8d0e25Snbeams     case CEED_EVAL_INTERP:
947e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
948437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
94980a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9507d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9517d8d0e25Snbeams       break;
9527d8d0e25Snbeams     case CEED_EVAL_GRAD:
953e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
954437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
95580a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9567d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9577d8d0e25Snbeams       if (useCollograd) {
958437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
95980a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
9607d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9617d8d0e25Snbeams       } else {
962437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
96380a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9647d8d0e25Snbeams         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9657d8d0e25Snbeams       }
9667d8d0e25Snbeams       break;
9677d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
9687d8d0e25Snbeams       break; // No action
9697d8d0e25Snbeams     case CEED_EVAL_DIV:
9707d8d0e25Snbeams       break; // TODO: Not implemented
9717d8d0e25Snbeams     case CEED_EVAL_CURL:
9727d8d0e25Snbeams       break; // TODO: Not implemented
9737d8d0e25Snbeams     }
9747d8d0e25Snbeams   }
9757d8d0e25Snbeams 
9767d8d0e25Snbeams   code << "\n  // -- Output field constants and basis data --\n";
9777d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9787d8d0e25Snbeams     code << "  // ---- Output field "<<i<<" ----\n";
9797d8d0e25Snbeams     // Get elemsize, emode, ncomp
9807d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
981e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9827d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
983e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9847d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
985e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9867d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
987e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9887d8d0e25Snbeams 
9897d8d0e25Snbeams     // Set field constants
990e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
9917d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
992e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9937d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
9947d8d0e25Snbeams     } else {
9957d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
9967d8d0e25Snbeams     }
9977d8d0e25Snbeams     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
9987d8d0e25Snbeams 
9997d8d0e25Snbeams     // Load basis data
10007d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10017d8d0e25Snbeams     switch (emode) {
10027d8d0e25Snbeams     case CEED_EVAL_NONE:
10037d8d0e25Snbeams       break; // No action
10047d8d0e25Snbeams     case CEED_EVAL_INTERP:
1005e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1006437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100780a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10087d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10097d8d0e25Snbeams       break;
10107d8d0e25Snbeams     case CEED_EVAL_GRAD:
1011e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1012437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
101380a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10147d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10157d8d0e25Snbeams       if (useCollograd) {
1016437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
101780a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
10187d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10197d8d0e25Snbeams       } else {
1020437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
102180a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10227d8d0e25Snbeams         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10237d8d0e25Snbeams       }
10247d8d0e25Snbeams       break;
10257d8d0e25Snbeams     // LCOV_EXCL_START
10267d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
10277d8d0e25Snbeams       Ceed ceed;
1028e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1029e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
10307d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
10317d8d0e25Snbeams       break; // Should not occur
10327d8d0e25Snbeams     }
10337d8d0e25Snbeams     case CEED_EVAL_DIV:
10347d8d0e25Snbeams       break; // TODO: Not implemented
10357d8d0e25Snbeams     case CEED_EVAL_CURL:
10367d8d0e25Snbeams       break; // TODO: Not implemented
10377d8d0e25Snbeams       // LCOV_EXCL_STOP
10387d8d0e25Snbeams     }
10397d8d0e25Snbeams   }
10407d8d0e25Snbeams   code << "\n  // -- Element loop --\n";
10417d8d0e25Snbeams   code << "  __syncthreads();\n";
10427d8d0e25Snbeams   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
10437d8d0e25Snbeams   // Input basis apply if needed
10447d8d0e25Snbeams   // Generate the correct eval mode code for each input
10457d8d0e25Snbeams   code << "    // -- Input field restrictions and basis actions --\n";
10467d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
10477d8d0e25Snbeams     code << "    // ---- Input field "<<i<<" ----\n";
10487d8d0e25Snbeams     // Get elemsize, emode, ncomp
10497d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1050e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10517d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1052e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10537d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1054e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10557d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1056e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10577d8d0e25Snbeams 
10587d8d0e25Snbeams     // Restriction
10597d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT &&
10607d8d0e25Snbeams         !((emode == CEED_EVAL_NONE) && useCollograd)) {
10617d8d0e25Snbeams       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10627d8d0e25Snbeams 
10637d8d0e25Snbeams       bool isStrided;
1064e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10657d8d0e25Snbeams       if (!isStrided) {
10667d8d0e25Snbeams         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1067e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10687d8d0e25Snbeams         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10697d8d0e25Snbeams         CeedInt compstride;
1070e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10717d8d0e25Snbeams         code << "    // CompStride: "<<compstride<<"\n";
1072e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10737d8d0e25Snbeams         data->indices.in[i] = restr_data->d_ind;
10747d8d0e25Snbeams         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
10757d8d0e25Snbeams       } else {
10767d8d0e25Snbeams         bool backendstrides;
10777d8d0e25Snbeams         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1078e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10797d8d0e25Snbeams         CeedInt nelem;
10807d8d0e25Snbeams         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1081e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10827d8d0e25Snbeams         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
10837d8d0e25Snbeams         if (!backendstrides) {
10847d8d0e25Snbeams           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1085e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
10867d8d0e25Snbeams         }
10877d8d0e25Snbeams         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
10887d8d0e25Snbeams         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
10897d8d0e25Snbeams       }
10907d8d0e25Snbeams     }
10917d8d0e25Snbeams 
10927d8d0e25Snbeams     // Basis action
10937d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10947d8d0e25Snbeams     switch (emode) {
10957d8d0e25Snbeams     case CEED_EVAL_NONE:
10967d8d0e25Snbeams       if (!useCollograd) {
10977d8d0e25Snbeams         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
10987d8d0e25Snbeams       }
10997d8d0e25Snbeams       break;
11007d8d0e25Snbeams     case CEED_EVAL_INTERP:
11017d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
11027d8d0e25Snbeams       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
11037d8d0e25Snbeams       break;
11047d8d0e25Snbeams     case CEED_EVAL_GRAD:
11057d8d0e25Snbeams       if (useCollograd) {
11067d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
11077d8d0e25Snbeams         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
11087d8d0e25Snbeams       } else {
11097d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
11107d8d0e25Snbeams         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
11117d8d0e25Snbeams       }
11127d8d0e25Snbeams       break;
11137d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
11147d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1115e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1116e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1117437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
11187d8d0e25Snbeams       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
11197d8d0e25Snbeams       break; // No action
11207d8d0e25Snbeams     case CEED_EVAL_DIV:
11217d8d0e25Snbeams       break; // TODO: Not implemented
11227d8d0e25Snbeams     case CEED_EVAL_CURL:
11237d8d0e25Snbeams       break; // TODO: Not implemented
11247d8d0e25Snbeams     }
11257d8d0e25Snbeams   }
11267d8d0e25Snbeams 
11277d8d0e25Snbeams   // Q function
11287d8d0e25Snbeams   code << "\n    // -- Output field setup --\n";
11297d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
11307d8d0e25Snbeams       code << "\n    // ---- Output field "<<i<<" ----\n";
11317d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1132e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
11337d8d0e25Snbeams     if (emode==CEED_EVAL_GRAD)
11347d8d0e25Snbeams     {
11357d8d0e25Snbeams       if (useCollograd) {
11367d8d0e25Snbeams         //Accumulator for gradient slices
11377d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11387d8d0e25Snbeams         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
11397d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
11407d8d0e25Snbeams         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
11417d8d0e25Snbeams         code << "      }\n";
11427d8d0e25Snbeams         code << "    }\n";
11437d8d0e25Snbeams       } else {
11447d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
11457d8d0e25Snbeams       }
11467d8d0e25Snbeams     }
11477d8d0e25Snbeams     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
11487d8d0e25Snbeams     {
11497d8d0e25Snbeams       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11507d8d0e25Snbeams     }
11517d8d0e25Snbeams   }
11527d8d0e25Snbeams   // We treat quadrature points per slice in 3d to save registers
11537d8d0e25Snbeams   if (useCollograd) {
11547d8d0e25Snbeams     code << "\n    // Note: Collocated Gradient\n";
11557d8d0e25Snbeams     code << "#pragma unroll\n";
11567d8d0e25Snbeams     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
11577d8d0e25Snbeams     code << "      // -- Input fields --\n";
11587d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
11597d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
11607d8d0e25Snbeams       // Get elemsize, emode, ncomp
11617d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1162e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
11637d8d0e25Snbeams       // Basis action
11647d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
11657d8d0e25Snbeams       switch (emode) {
11667d8d0e25Snbeams       case CEED_EVAL_NONE:
11677d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11687d8d0e25Snbeams 
11697d8d0e25Snbeams         bool isStrided;
1170e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1171e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1172e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11737d8d0e25Snbeams         if (!isStrided) {
11747d8d0e25Snbeams           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1175e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11767d8d0e25Snbeams           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11777d8d0e25Snbeams           CeedInt compstride;
1178e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11797d8d0e25Snbeams           code << "      // CompStride: "<<compstride<<"\n";
1180e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11817d8d0e25Snbeams           data->indices.in[i] = restr_data->d_ind;
11827d8d0e25Snbeams           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
11837d8d0e25Snbeams         } else {
11847d8d0e25Snbeams           bool backendstrides;
11857d8d0e25Snbeams           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1186e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11877d8d0e25Snbeams           CeedInt nelem;
11887d8d0e25Snbeams           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1189e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11907d8d0e25Snbeams           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
11917d8d0e25Snbeams           if (!backendstrides) {
11927d8d0e25Snbeams             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1193e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
11947d8d0e25Snbeams           }
11957d8d0e25Snbeams           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
11967d8d0e25Snbeams           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
11977d8d0e25Snbeams         }
11987d8d0e25Snbeams         break;
11997d8d0e25Snbeams       case CEED_EVAL_INTERP:
12007d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
12017d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
12027d8d0e25Snbeams         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
12037d8d0e25Snbeams         code << "      }\n";
12047d8d0e25Snbeams         break;
12057d8d0e25Snbeams       case CEED_EVAL_GRAD:
12067d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
12077d8d0e25Snbeams         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
12087d8d0e25Snbeams         break;
12097d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12107d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[1];\n";
12117d8d0e25Snbeams         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
12127d8d0e25Snbeams         break; // No action
12137d8d0e25Snbeams       case CEED_EVAL_DIV:
12147d8d0e25Snbeams         break; // TODO: Not implemented
12157d8d0e25Snbeams       case CEED_EVAL_CURL:
12167d8d0e25Snbeams         break; // TODO: Not implemented
12177d8d0e25Snbeams       }
12187d8d0e25Snbeams     }
12197d8d0e25Snbeams     code << "\n      // -- Output fields --\n";
12207d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12217d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12227d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1223e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12247d8d0e25Snbeams       // Basis action
12257d8d0e25Snbeams       switch (emode) {
12267d8d0e25Snbeams       case CEED_EVAL_NONE:
12277d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12287d8d0e25Snbeams         break; // No action
12297d8d0e25Snbeams       case CEED_EVAL_INTERP:
12307d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12317d8d0e25Snbeams         break;
12327d8d0e25Snbeams       case CEED_EVAL_GRAD:
12337d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
12347d8d0e25Snbeams         break;
12357d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12367d8d0e25Snbeams         break; // Should not occur
12377d8d0e25Snbeams       case CEED_EVAL_DIV:
12387d8d0e25Snbeams         break; // TODO: Not implemented
12397d8d0e25Snbeams       case CEED_EVAL_CURL:
12407d8d0e25Snbeams         break; // TODO: Not implemented
12417d8d0e25Snbeams       }
12427d8d0e25Snbeams     }
12437d8d0e25Snbeams   } else {
12447d8d0e25Snbeams     code << "\n      // Note: No Collocated Gradient\n";
12457d8d0e25Snbeams     code << "      // -- Input fields --\n";
12467d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
12477d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
12487d8d0e25Snbeams       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
12497d8d0e25Snbeams     }
12507d8d0e25Snbeams     code << "      // -- Output fields --\n";
12517d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12527d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12537d8d0e25Snbeams       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
12547d8d0e25Snbeams     }
12557d8d0e25Snbeams   }
12567d8d0e25Snbeams   code << "\n      // -- QFunction Inputs and outputs --\n";
12577d8d0e25Snbeams   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12587d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
12597d8d0e25Snbeams     code << "      // ---- Input field "<<i<<" ----\n";
12607d8d0e25Snbeams     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12617d8d0e25Snbeams   }
12627d8d0e25Snbeams   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12637d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
12647d8d0e25Snbeams     code << "      // ---- Output field "<<i<<" ----\n";
12657d8d0e25Snbeams     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12667d8d0e25Snbeams   }
12677d8d0e25Snbeams   code << "\n      // -- Apply QFunction --\n";
12687d8d0e25Snbeams   code << "      "<<qFunctionName<<"(ctx, ";
12697d8d0e25Snbeams   if (dim != 3 || useCollograd) {
12707d8d0e25Snbeams     code << "1";
12717d8d0e25Snbeams   } else {
12727d8d0e25Snbeams     code << "Q1d";
12737d8d0e25Snbeams   }
12747d8d0e25Snbeams   code << ", in, out);\n";
12757d8d0e25Snbeams   if (useCollograd) {
12767d8d0e25Snbeams     code << "\n      // Note: Collocated Gradient\n";
12777d8d0e25Snbeams     code << "      // -- Output fields --\n";
12787d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12797d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12807d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1281e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12827d8d0e25Snbeams       // Basis action
12837d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
12847d8d0e25Snbeams       switch (emode) {
12857d8d0e25Snbeams       case CEED_EVAL_NONE:
12867d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12877d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12887d8d0e25Snbeams         code << "      }\n";
12897d8d0e25Snbeams         break; // No action
12907d8d0e25Snbeams       case CEED_EVAL_INTERP:
12917d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12927d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12937d8d0e25Snbeams         code << "      }\n";
12947d8d0e25Snbeams         break;
12957d8d0e25Snbeams       case CEED_EVAL_GRAD:
12967d8d0e25Snbeams         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
12977d8d0e25Snbeams         break;
12987d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12997d8d0e25Snbeams         break; // Should not occur
13007d8d0e25Snbeams       case CEED_EVAL_DIV:
13017d8d0e25Snbeams         break; // TODO: Not implemented
13027d8d0e25Snbeams       case CEED_EVAL_CURL:
13037d8d0e25Snbeams         break; // TODO: Not implemented
13047d8d0e25Snbeams       }
13057d8d0e25Snbeams     }
13067d8d0e25Snbeams     code << "    }\n";
13077d8d0e25Snbeams   }
13087d8d0e25Snbeams 
13097d8d0e25Snbeams   // Output basis apply if needed
13107d8d0e25Snbeams   // Generate the correct eval mode code for each output
13117d8d0e25Snbeams   code << "\n    // -- Output field basis action and restrictions --\n";
13127d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
13137d8d0e25Snbeams     code << "    // ---- Output field "<<i<<" ----\n";
13147d8d0e25Snbeams     // Get elemsize, emode, ncomp
13157d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1316e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13177d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1318e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13197d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1320e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13217d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1322e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13237d8d0e25Snbeams     // Basis action
13247d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
13257d8d0e25Snbeams     switch (emode) {
13267d8d0e25Snbeams     case CEED_EVAL_NONE:
13277d8d0e25Snbeams       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
13287d8d0e25Snbeams       break; // No action
13297d8d0e25Snbeams     case CEED_EVAL_INTERP:
13307d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13317d8d0e25Snbeams       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13327d8d0e25Snbeams       break;
13337d8d0e25Snbeams     case CEED_EVAL_GRAD:
13347d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13357d8d0e25Snbeams       if (useCollograd) {
13367d8d0e25Snbeams         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13377d8d0e25Snbeams       } else {
13387d8d0e25Snbeams         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
13397d8d0e25Snbeams       }
13407d8d0e25Snbeams       break;
13417d8d0e25Snbeams     // LCOV_EXCL_START
13427d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
13437d8d0e25Snbeams       Ceed ceed;
1344e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1345e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
13467d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
13477d8d0e25Snbeams       break; // Should not occur
13487d8d0e25Snbeams     }
13497d8d0e25Snbeams     case CEED_EVAL_DIV:
13507d8d0e25Snbeams       break; // TODO: Not implemented
13517d8d0e25Snbeams     case CEED_EVAL_CURL:
13527d8d0e25Snbeams       break; // TODO: Not implemented
13537d8d0e25Snbeams       // LCOV_EXCL_STOP
13547d8d0e25Snbeams     }
13557d8d0e25Snbeams     // Restriction
13567d8d0e25Snbeams       bool isStrided;
1357e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13587d8d0e25Snbeams     if (!isStrided) {
13597d8d0e25Snbeams       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1360e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13617d8d0e25Snbeams       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13627d8d0e25Snbeams       CeedInt compstride;
1363e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13647d8d0e25Snbeams       code << "    // CompStride: "<<compstride<<"\n";
1365e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13667d8d0e25Snbeams       data->indices.out[i] = restr_data->d_ind;
13677d8d0e25Snbeams       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
13687d8d0e25Snbeams     } else {
13697d8d0e25Snbeams       bool backendstrides;
13707d8d0e25Snbeams       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1371e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13727d8d0e25Snbeams       CeedInt nelem;
13737d8d0e25Snbeams       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1374e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13757d8d0e25Snbeams       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
13767d8d0e25Snbeams       if (!backendstrides) {
13777d8d0e25Snbeams         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1378e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
13797d8d0e25Snbeams       }
13807d8d0e25Snbeams       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
13817d8d0e25Snbeams       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
13827d8d0e25Snbeams     }
13837d8d0e25Snbeams   }
13847d8d0e25Snbeams 
13857d8d0e25Snbeams   code << "  }\n";
13867d8d0e25Snbeams   code << "}\n";
13877d8d0e25Snbeams   code << "// -----------------------------------------------------------------------------\n\n";
13887d8d0e25Snbeams 
13897d8d0e25Snbeams   // View kernel for debugging
139046dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13913f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
13927d8d0e25Snbeams 
1393*b3e1519bSnbeams   CeedInt block_sizes[3];
1394*b3e1519bSnbeams   ierr = BlockGridCalculate(dim, numelements, data->maxP1d, Q1d, block_sizes);
1395*b3e1519bSnbeams   CeedChkBackend(ierr);
1396*b3e1519bSnbeams   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1397*b3e1519bSnbeams                          "T1d", block_sizes[0],
1398*b3e1519bSnbeams 			 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1399e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
14007d8d0e25Snbeams   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1401e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
14027d8d0e25Snbeams 
1403e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1404e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
14057d8d0e25Snbeams }
14067d8d0e25Snbeams //------------------------------------------------------------------------------
1407