xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 46dc07349c44057b9efae59cd6a2d41f419237bd)
17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details.
47d8d0e25Snbeams //
57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software
67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral
77d8d0e25Snbeams // element discretizations for exascale applications. For more information and
87d8d0e25Snbeams // source code availability see http://github.com/ceed.
97d8d0e25Snbeams //
107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office
127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for
137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including
147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early
157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
177d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
187d8d0e25Snbeams 
19ec3da8bcSJed Brown #include <ceed/ceed.h>
20ec3da8bcSJed Brown #include <ceed/backend.h>
217d8d0e25Snbeams #include <iostream>
223d576824SJeremy L Thompson #include <string>
237d8d0e25Snbeams #include <sstream>
243d576824SJeremy L Thompson #include "ceed-hip-gen.h"
250d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
267d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
277d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
287d8d0e25Snbeams 
297d8d0e25Snbeams static const char *deviceFunctions = QUOTE(
307d8d0e25Snbeams 
317d8d0e25Snbeams //------------------------------------------------------------------------------
327d8d0e25Snbeams // Typedefs
337d8d0e25Snbeams //------------------------------------------------------------------------------
347d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
357d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
367d8d0e25Snbeams 
377d8d0e25Snbeams typedef struct {
387d8d0e25Snbeams   CeedInt tidx;
397d8d0e25Snbeams   CeedInt tidy;
407d8d0e25Snbeams   CeedInt tidz;
417d8d0e25Snbeams   CeedInt tid;
427d8d0e25Snbeams   CeedScalar* slice;
437d8d0e25Snbeams } BackendData;
447d8d0e25Snbeams 
457d8d0e25Snbeams //------------------------------------------------------------------------------
467d8d0e25Snbeams // Load matrices for basis actions
477d8d0e25Snbeams //------------------------------------------------------------------------------
487d8d0e25Snbeams template <int P, int Q>
497d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
507d8d0e25Snbeams   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
517d8d0e25Snbeams     B[i] = d_B[i];
527d8d0e25Snbeams }
537d8d0e25Snbeams 
547d8d0e25Snbeams //------------------------------------------------------------------------------
557d8d0e25Snbeams // 1D
567d8d0e25Snbeams //------------------------------------------------------------------------------
577d8d0e25Snbeams 
587d8d0e25Snbeams //------------------------------------------------------------------------------
597d8d0e25Snbeams // L-vector -> E-vector, offsets provided
607d8d0e25Snbeams //------------------------------------------------------------------------------
617d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
627d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
637d8d0e25Snbeams   if (data.tidx < P1d) {
647d8d0e25Snbeams     const CeedInt node = data.tidx;
657d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
667d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
677d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
687d8d0e25Snbeams   }
697d8d0e25Snbeams }
707d8d0e25Snbeams 
717d8d0e25Snbeams //------------------------------------------------------------------------------
727d8d0e25Snbeams // L-vector -> E-vector, strided
737d8d0e25Snbeams //------------------------------------------------------------------------------
747d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
757d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
767d8d0e25Snbeams   if (data.tidx < P1d) {
777d8d0e25Snbeams     const CeedInt node = data.tidx;
787d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
797d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
807d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
817d8d0e25Snbeams   }
827d8d0e25Snbeams }
837d8d0e25Snbeams 
847d8d0e25Snbeams //------------------------------------------------------------------------------
857d8d0e25Snbeams // E-vector -> L-vector, offsets provided
867d8d0e25Snbeams //------------------------------------------------------------------------------
877d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
887d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
897d8d0e25Snbeams   if (data.tidx < P1d) {
907d8d0e25Snbeams     const CeedInt node = data.tidx;
917d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
927d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
937d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
947d8d0e25Snbeams   }
957d8d0e25Snbeams }
967d8d0e25Snbeams 
977d8d0e25Snbeams //------------------------------------------------------------------------------
987d8d0e25Snbeams // E-vector -> L-vector, strided
997d8d0e25Snbeams //------------------------------------------------------------------------------
1007d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1017d8d0e25Snbeams inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1027d8d0e25Snbeams   if (data.tidx < P1d) {
1037d8d0e25Snbeams     const CeedInt node = data.tidx;
1047d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1057d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1067d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1077d8d0e25Snbeams   }
1087d8d0e25Snbeams }
1097d8d0e25Snbeams 
1107d8d0e25Snbeams //------------------------------------------------------------------------------
1117d8d0e25Snbeams // 1D tensor contraction x
1127d8d0e25Snbeams //------------------------------------------------------------------------------
1137d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1147d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1157d8d0e25Snbeams   data.slice[data.tidx] = *U;
1167d8d0e25Snbeams   __syncthreads();
1177d8d0e25Snbeams   *V = 0.0;
1187d8d0e25Snbeams   if (data.tidx < Q1d)
1197d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
1207d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
1217d8d0e25Snbeams   __syncthreads();
1227d8d0e25Snbeams }
1237d8d0e25Snbeams 
1247d8d0e25Snbeams //------------------------------------------------------------------------------
1257d8d0e25Snbeams // 1D transpose tensor contraction x
1267d8d0e25Snbeams //------------------------------------------------------------------------------
1277d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1287d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1297d8d0e25Snbeams   data.slice[data.tidx] = *U;
1307d8d0e25Snbeams   __syncthreads();
1317d8d0e25Snbeams   *V = 0.0;
1327d8d0e25Snbeams   if (data.tidx < P1d)
1337d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
1347d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
1357d8d0e25Snbeams   __syncthreads();
1367d8d0e25Snbeams }
1377d8d0e25Snbeams 
1387d8d0e25Snbeams //------------------------------------------------------------------------------
1397d8d0e25Snbeams // 1D interpolate to quadrature points
1407d8d0e25Snbeams //------------------------------------------------------------------------------
1417d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1427d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1437d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1447d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1457d8d0e25Snbeams }
1467d8d0e25Snbeams 
1477d8d0e25Snbeams //------------------------------------------------------------------------------
1487d8d0e25Snbeams // 1D interpolate transpose
1497d8d0e25Snbeams //------------------------------------------------------------------------------
1507d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1517d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1527d8d0e25Snbeams   for (CeedInt comp=0; comp<NCOMP; comp++)
1537d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1547d8d0e25Snbeams }
1557d8d0e25Snbeams 
1567d8d0e25Snbeams //------------------------------------------------------------------------------
1577d8d0e25Snbeams // 1D derivatives at quadrature points
1587d8d0e25Snbeams //------------------------------------------------------------------------------
1597d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1607d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1617d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1627d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1637d8d0e25Snbeams }
1647d8d0e25Snbeams 
1657d8d0e25Snbeams //------------------------------------------------------------------------------
1667d8d0e25Snbeams // 1D derivatives transpose
1677d8d0e25Snbeams //------------------------------------------------------------------------------
1687d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1697d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1707d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1717d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1727d8d0e25Snbeams }
1737d8d0e25Snbeams 
1747d8d0e25Snbeams //------------------------------------------------------------------------------
1757d8d0e25Snbeams // 2D
1767d8d0e25Snbeams //------------------------------------------------------------------------------
1777d8d0e25Snbeams 
1787d8d0e25Snbeams //------------------------------------------------------------------------------
1797d8d0e25Snbeams // L-vector -> E-vector, offsets provided
1807d8d0e25Snbeams //------------------------------------------------------------------------------
1817d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
1827d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
1837d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
1847d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
1857d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
1867d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1877d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
1887d8d0e25Snbeams   }
1897d8d0e25Snbeams }
1907d8d0e25Snbeams 
1917d8d0e25Snbeams //------------------------------------------------------------------------------
1927d8d0e25Snbeams // L-vector -> E-vector, strided
1937d8d0e25Snbeams //------------------------------------------------------------------------------
1947d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1957d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
1967d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
1977d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
1987d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1997d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2007d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2017d8d0e25Snbeams   }
2027d8d0e25Snbeams }
2037d8d0e25Snbeams 
2047d8d0e25Snbeams //------------------------------------------------------------------------------
2057d8d0e25Snbeams // E-vector -> L-vector, offsets provided
2067d8d0e25Snbeams //------------------------------------------------------------------------------
2077d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2087d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
2097d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2107d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2117d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2127d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2137d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
2147d8d0e25Snbeams   }
2157d8d0e25Snbeams }
2167d8d0e25Snbeams 
2177d8d0e25Snbeams //------------------------------------------------------------------------------
2187d8d0e25Snbeams // E-vector -> L-vector, strided
2197d8d0e25Snbeams //------------------------------------------------------------------------------
2207d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2217d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2227d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2237d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2247d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2257d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2267d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2277d8d0e25Snbeams   }
2287d8d0e25Snbeams }
2297d8d0e25Snbeams 
2307d8d0e25Snbeams //------------------------------------------------------------------------------
2317d8d0e25Snbeams // 2D tensor contraction x
2327d8d0e25Snbeams //------------------------------------------------------------------------------
2337d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2347d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2357d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2367d8d0e25Snbeams   __syncthreads();
2377d8d0e25Snbeams   *V = 0.0;
2387d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2397d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2407d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2417d8d0e25Snbeams   __syncthreads();
2427d8d0e25Snbeams }
2437d8d0e25Snbeams 
2447d8d0e25Snbeams //------------------------------------------------------------------------------
2457d8d0e25Snbeams // 2D tensor contract y
2467d8d0e25Snbeams //------------------------------------------------------------------------------
2477d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2487d8d0e25Snbeams inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2497d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2507d8d0e25Snbeams   __syncthreads();
2517d8d0e25Snbeams   *V = 0.0;
2527d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d)
2537d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2547d8d0e25Snbeams       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2557d8d0e25Snbeams   __syncthreads();
2567d8d0e25Snbeams }
2577d8d0e25Snbeams 
2587d8d0e25Snbeams //------------------------------------------------------------------------------
2597d8d0e25Snbeams // 2D transpose tensor contract y
2607d8d0e25Snbeams //------------------------------------------------------------------------------
2617d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2627d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2637d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2647d8d0e25Snbeams   __syncthreads();
2657d8d0e25Snbeams   *V = 0.0;
2667d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2677d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2687d8d0e25Snbeams       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2697d8d0e25Snbeams   __syncthreads();
2707d8d0e25Snbeams }
2717d8d0e25Snbeams 
2727d8d0e25Snbeams //------------------------------------------------------------------------------
2737d8d0e25Snbeams // 2D transpose tensor contract x
2747d8d0e25Snbeams //------------------------------------------------------------------------------
2757d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2767d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2777d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2787d8d0e25Snbeams   __syncthreads();
2797d8d0e25Snbeams   *V = 0.0;
2807d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
2817d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2827d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2837d8d0e25Snbeams   __syncthreads();
2847d8d0e25Snbeams }
2857d8d0e25Snbeams 
2867d8d0e25Snbeams //------------------------------------------------------------------------------
2877d8d0e25Snbeams // 2D transpose tensor contract and add x
2887d8d0e25Snbeams //------------------------------------------------------------------------------
2897d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2907d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2917d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2927d8d0e25Snbeams   __syncthreads();
2937d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
2947d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2957d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2967d8d0e25Snbeams   __syncthreads();
2977d8d0e25Snbeams }
2987d8d0e25Snbeams 
2997d8d0e25Snbeams //------------------------------------------------------------------------------
3007d8d0e25Snbeams // 2D interpolate to quadrature points
3017d8d0e25Snbeams //------------------------------------------------------------------------------
3027d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3037d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3047d8d0e25Snbeams   CeedScalar r_t[1];
3057d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3067d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3077d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3087d8d0e25Snbeams   }
3097d8d0e25Snbeams }
3107d8d0e25Snbeams 
3117d8d0e25Snbeams //------------------------------------------------------------------------------
3127d8d0e25Snbeams // 2D interpolate transpose
3137d8d0e25Snbeams //------------------------------------------------------------------------------
3147d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3157d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3167d8d0e25Snbeams   CeedScalar r_t[1];
3177d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3187d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3197d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3207d8d0e25Snbeams   }
3217d8d0e25Snbeams }
3227d8d0e25Snbeams 
3237d8d0e25Snbeams //------------------------------------------------------------------------------
3247d8d0e25Snbeams // 2D derivatives at quadrature points
3257d8d0e25Snbeams //------------------------------------------------------------------------------
3267d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3277d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3287d8d0e25Snbeams   CeedScalar r_t[1];
3297d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3307d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
3317d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
3327d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3337d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
3347d8d0e25Snbeams   }
3357d8d0e25Snbeams }
3367d8d0e25Snbeams 
3377d8d0e25Snbeams //------------------------------------------------------------------------------
3387d8d0e25Snbeams // 2D derivatives transpose
3397d8d0e25Snbeams //------------------------------------------------------------------------------
3407d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3417d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3427d8d0e25Snbeams   CeedScalar r_t[1];
3437d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3447d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
3457d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
3467d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
3477d8d0e25Snbeams     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3487d8d0e25Snbeams   }
3497d8d0e25Snbeams }
3507d8d0e25Snbeams 
3517d8d0e25Snbeams //------------------------------------------------------------------------------
3527d8d0e25Snbeams // 3D
3537d8d0e25Snbeams //------------------------------------------------------------------------------
3547d8d0e25Snbeams 
3557d8d0e25Snbeams //------------------------------------------------------------------------------
3567d8d0e25Snbeams // L-vector -> E-vector, offsets provided
3577d8d0e25Snbeams //------------------------------------------------------------------------------
3587d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
3597d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
3607d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3617d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3627d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3637d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
3647d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3657d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
3667d8d0e25Snbeams     }
3677d8d0e25Snbeams }
3687d8d0e25Snbeams 
3697d8d0e25Snbeams //------------------------------------------------------------------------------
3707d8d0e25Snbeams // L-vector -> E-vector, strided
3717d8d0e25Snbeams //------------------------------------------------------------------------------
3727d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
3737d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
3747d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3757d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3767d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3777d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
3787d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3797d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
3807d8d0e25Snbeams     }
3817d8d0e25Snbeams }
3827d8d0e25Snbeams 
3837d8d0e25Snbeams //------------------------------------------------------------------------------
3847d8d0e25Snbeams // E-vector -> Q-vector, offests provided
3857d8d0e25Snbeams //------------------------------------------------------------------------------
3867d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d>
3877d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
3887d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
3897d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
3907d8d0e25Snbeams     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
3917d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
3927d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
3937d8d0e25Snbeams   }
3947d8d0e25Snbeams }
3957d8d0e25Snbeams 
3967d8d0e25Snbeams //------------------------------------------------------------------------------
3977d8d0e25Snbeams // E-vector -> Q-vector, strided
3987d8d0e25Snbeams //------------------------------------------------------------------------------
3997d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4007d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
4017d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4027d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4037d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4047d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4057d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
4067d8d0e25Snbeams   }
4077d8d0e25Snbeams }
4087d8d0e25Snbeams 
4097d8d0e25Snbeams //------------------------------------------------------------------------------
4107d8d0e25Snbeams // E-vector -> L-vector, offsets provided
4117d8d0e25Snbeams //------------------------------------------------------------------------------
4127d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
4137d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
4147d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4157d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4167d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4177d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
4187d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4197d8d0e25Snbeams         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
4207d8d0e25Snbeams     }
4217d8d0e25Snbeams }
4227d8d0e25Snbeams 
4237d8d0e25Snbeams //------------------------------------------------------------------------------
4247d8d0e25Snbeams // E-vector -> L-vector, strided
4257d8d0e25Snbeams //------------------------------------------------------------------------------
4267d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4277d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
4287d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4297d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4307d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4317d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4327d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4337d8d0e25Snbeams         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
4347d8d0e25Snbeams     }
4357d8d0e25Snbeams }
4367d8d0e25Snbeams 
4377d8d0e25Snbeams //------------------------------------------------------------------------------
4387d8d0e25Snbeams // 3D tensor contract x
4397d8d0e25Snbeams //------------------------------------------------------------------------------
4407d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4417d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4427d8d0e25Snbeams   CeedScalar r_B[P1d];
4437d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4447d8d0e25Snbeams     r_B[i] = B[i + data.tidx*P1d];
4457d8d0e25Snbeams 
4467d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4477d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4487d8d0e25Snbeams     __syncthreads();
4497d8d0e25Snbeams     V[k] = 0.0;
4507d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
4517d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4527d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
4537d8d0e25Snbeams     __syncthreads();
4547d8d0e25Snbeams   }
4557d8d0e25Snbeams }
4567d8d0e25Snbeams 
4577d8d0e25Snbeams //------------------------------------------------------------------------------
4587d8d0e25Snbeams // 3D tensor contract y
4597d8d0e25Snbeams //------------------------------------------------------------------------------
4607d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4617d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4627d8d0e25Snbeams   CeedScalar r_B[P1d];
4637d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4647d8d0e25Snbeams     r_B[i] = B[i + data.tidy*P1d];
4657d8d0e25Snbeams 
4667d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4677d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4687d8d0e25Snbeams     __syncthreads();
4697d8d0e25Snbeams     V[k] = 0.0;
4707d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
4717d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4727d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
4737d8d0e25Snbeams     __syncthreads();
4747d8d0e25Snbeams   }
4757d8d0e25Snbeams }
4767d8d0e25Snbeams 
4777d8d0e25Snbeams //------------------------------------------------------------------------------
4787d8d0e25Snbeams // 3D tensor contract z
4797d8d0e25Snbeams //------------------------------------------------------------------------------
4807d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4817d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4827d8d0e25Snbeams   for (CeedInt k = 0; k < Q1d; ++k) {
4837d8d0e25Snbeams     V[k] = 0.0;
4847d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
4857d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4867d8d0e25Snbeams         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
4877d8d0e25Snbeams   }
4887d8d0e25Snbeams }
4897d8d0e25Snbeams 
4907d8d0e25Snbeams //------------------------------------------------------------------------------
4917d8d0e25Snbeams // 3D transpose tensor contract z
4927d8d0e25Snbeams //------------------------------------------------------------------------------
4937d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4947d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4957d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4967d8d0e25Snbeams     V[k] = 0.0;
4977d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
4987d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
4997d8d0e25Snbeams         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
5007d8d0e25Snbeams   }
5017d8d0e25Snbeams }
5027d8d0e25Snbeams 
5037d8d0e25Snbeams //------------------------------------------------------------------------------
5047d8d0e25Snbeams // 3D transpose tensor contract y
5057d8d0e25Snbeams //------------------------------------------------------------------------------
5067d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5077d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5087d8d0e25Snbeams   CeedScalar r_B[Q1d];
5097d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5107d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5117d8d0e25Snbeams 
5127d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5137d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5147d8d0e25Snbeams     __syncthreads();
5157d8d0e25Snbeams     V[k] = 0.0;
5167d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5177d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5187d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5197d8d0e25Snbeams     __syncthreads();
5207d8d0e25Snbeams   }
5217d8d0e25Snbeams }
5227d8d0e25Snbeams 
5237d8d0e25Snbeams //------------------------------------------------------------------------------
5247d8d0e25Snbeams // 3D transpose tensor contract add y
5257d8d0e25Snbeams //------------------------------------------------------------------------------
5267d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5277d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5287d8d0e25Snbeams   CeedScalar r_B[Q1d];
5297d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5307d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5317d8d0e25Snbeams 
5327d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5337d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5347d8d0e25Snbeams     __syncthreads();
5357d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5367d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5377d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5387d8d0e25Snbeams     __syncthreads();
5397d8d0e25Snbeams   }
5407d8d0e25Snbeams }
5417d8d0e25Snbeams 
5427d8d0e25Snbeams //------------------------------------------------------------------------------
5437d8d0e25Snbeams // 3D transpose tensor contract x
5447d8d0e25Snbeams //------------------------------------------------------------------------------
5457d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5467d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5477d8d0e25Snbeams   CeedScalar r_B[Q1d];
5487d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5497d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5507d8d0e25Snbeams 
5517d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5527d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5537d8d0e25Snbeams     __syncthreads();
5547d8d0e25Snbeams     V[k] = 0.0;
5557d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5567d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5577d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5587d8d0e25Snbeams     __syncthreads();
5597d8d0e25Snbeams   }
5607d8d0e25Snbeams }
5617d8d0e25Snbeams 
5627d8d0e25Snbeams //------------------------------------------------------------------------------
5637d8d0e25Snbeams // 3D transpose tensor contract add x
5647d8d0e25Snbeams //------------------------------------------------------------------------------
5657d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5667d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5677d8d0e25Snbeams   CeedScalar r_B[Q1d];
5687d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5697d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5707d8d0e25Snbeams 
5717d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5727d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5737d8d0e25Snbeams     __syncthreads();
5747d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5757d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5767d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5777d8d0e25Snbeams     __syncthreads();
5787d8d0e25Snbeams   }
5797d8d0e25Snbeams }
5807d8d0e25Snbeams 
5817d8d0e25Snbeams //------------------------------------------------------------------------------
5827d8d0e25Snbeams // 3D interpolate to quadrature points
5837d8d0e25Snbeams //------------------------------------------------------------------------------
5847d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5857d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
5867d8d0e25Snbeams   CeedScalar r_t1[T1d];
5877d8d0e25Snbeams   CeedScalar r_t2[T1d];
5887d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
5897d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
5907d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
5917d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
5927d8d0e25Snbeams   }
5937d8d0e25Snbeams }
5947d8d0e25Snbeams 
5957d8d0e25Snbeams //------------------------------------------------------------------------------
5967d8d0e25Snbeams // 3D interpolate transpose
5977d8d0e25Snbeams //------------------------------------------------------------------------------
5987d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5997d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6007d8d0e25Snbeams   CeedScalar r_t1[T1d];
6017d8d0e25Snbeams   CeedScalar r_t2[T1d];
6027d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6037d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
6047d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6057d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6067d8d0e25Snbeams   }
6077d8d0e25Snbeams }
6087d8d0e25Snbeams 
6097d8d0e25Snbeams //------------------------------------------------------------------------------
6107d8d0e25Snbeams // 3D derivatives at quadrature points
6117d8d0e25Snbeams //------------------------------------------------------------------------------
6127d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6137d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6147d8d0e25Snbeams   CeedScalar r_t1[T1d];
6157d8d0e25Snbeams   CeedScalar r_t2[T1d];
6167d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6177d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
6187d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6197d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
6207d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6217d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6227d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
6237d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6247d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6257d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
6267d8d0e25Snbeams   }
6277d8d0e25Snbeams }
6287d8d0e25Snbeams 
6297d8d0e25Snbeams //------------------------------------------------------------------------------
6307d8d0e25Snbeams // 3D derivatives transpose
6317d8d0e25Snbeams //------------------------------------------------------------------------------
6327d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6337d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6347d8d0e25Snbeams   CeedScalar r_t1[T1d];
6357d8d0e25Snbeams   CeedScalar r_t2[T1d];
6367d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6377d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
6387d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6397d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
6407d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
6417d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6427d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6437d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
6447d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6457d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6467d8d0e25Snbeams   }
6477d8d0e25Snbeams }
6487d8d0e25Snbeams 
6497d8d0e25Snbeams //------------------------------------------------------------------------------
6507d8d0e25Snbeams // 3D collocated derivatives computation
6517d8d0e25Snbeams //------------------------------------------------------------------------------
6527d8d0e25Snbeams template <int NCOMP, int Q1d>
6537d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6547d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
6557d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
6567d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
6577d8d0e25Snbeams       __syncthreads();
6587d8d0e25Snbeams       // X derivative
6597d8d0e25Snbeams       r_V[comp+0*NCOMP] = 0.0;
6607d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6617d8d0e25Snbeams         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
6627d8d0e25Snbeams       // Y derivative
6637d8d0e25Snbeams       r_V[comp+1*NCOMP] = 0.0;
6647d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6657d8d0e25Snbeams         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
6667d8d0e25Snbeams       // Z derivative
6677d8d0e25Snbeams       r_V[comp+2*NCOMP] = 0.0;
6687d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6697d8d0e25Snbeams         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
6707d8d0e25Snbeams       __syncthreads();
6717d8d0e25Snbeams     }
6727d8d0e25Snbeams   }
6737d8d0e25Snbeams }
6747d8d0e25Snbeams 
6757d8d0e25Snbeams //------------------------------------------------------------------------------
6767d8d0e25Snbeams // 3D collocated derivatives transpose
6777d8d0e25Snbeams //------------------------------------------------------------------------------
6787d8d0e25Snbeams template <int NCOMP, int Q1d>
6797d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6807d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
6817d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
6827d8d0e25Snbeams       // X derivative
6837d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
6847d8d0e25Snbeams       __syncthreads();
6857d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6867d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
6877d8d0e25Snbeams       __syncthreads();
6887d8d0e25Snbeams       // Y derivative
6897d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
6907d8d0e25Snbeams       __syncthreads();
6917d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6927d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
6937d8d0e25Snbeams       __syncthreads();
6947d8d0e25Snbeams       // Z derivative
6957d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6967d8d0e25Snbeams         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
6977d8d0e25Snbeams     }
6987d8d0e25Snbeams   }
6997d8d0e25Snbeams }
7007d8d0e25Snbeams 
7017d8d0e25Snbeams //------------------------------------------------------------------------------
7027d8d0e25Snbeams // 1D quadrature weights
7037d8d0e25Snbeams //------------------------------------------------------------------------------
7047d8d0e25Snbeams template <int Q1d>
7057d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7067d8d0e25Snbeams   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
7077d8d0e25Snbeams }
7087d8d0e25Snbeams 
7097d8d0e25Snbeams //------------------------------------------------------------------------------
7107d8d0e25Snbeams // 2D quadrature weights
7117d8d0e25Snbeams //------------------------------------------------------------------------------
7127d8d0e25Snbeams template <int Q1d>
7137d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7147d8d0e25Snbeams   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
7157d8d0e25Snbeams         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7167d8d0e25Snbeams }
7177d8d0e25Snbeams 
7187d8d0e25Snbeams //------------------------------------------------------------------------------
7197d8d0e25Snbeams // 3D quadrature weights
7207d8d0e25Snbeams //------------------------------------------------------------------------------
7217d8d0e25Snbeams template <int Q1d>
7227d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7237d8d0e25Snbeams   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
7247d8d0e25Snbeams   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7257d8d0e25Snbeams   for (CeedInt z = 0; z < Q1d; ++z)
7267d8d0e25Snbeams     w[z] = quad ? pw*qweight1d[z] : 0.0;
7277d8d0e25Snbeams }
7287d8d0e25Snbeams 
7297d8d0e25Snbeams );
7307d8d0e25Snbeams //------------------------------------------------------------------------------
7317d8d0e25Snbeams // Build singe operator kernel
7327d8d0e25Snbeams //------------------------------------------------------------------------------
733680aa83fSnbeams CEED_INTERN int CeedHipGenOperatorBuild(CeedOperator op) {
7347d8d0e25Snbeams 
7357d8d0e25Snbeams   using std::ostringstream;
7367d8d0e25Snbeams   using std::string;
7377d8d0e25Snbeams   int ierr;
7387d8d0e25Snbeams   bool setupdone;
739e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
740e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
7417d8d0e25Snbeams   Ceed ceed;
742e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
7437d8d0e25Snbeams   CeedOperator_Hip_gen *data;
744e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
7457d8d0e25Snbeams   CeedQFunction qf;
7467d8d0e25Snbeams   CeedQFunction_Hip_gen *qf_data;
747e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
748e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
74978464608Sjeremylt   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
7507d8d0e25Snbeams           numoutputfields, ncomp, dim = 0, lsize;
751e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
752e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
7537d8d0e25Snbeams   CeedOperatorField *opinputfields, *opoutputfields;
7547e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
755e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7567d8d0e25Snbeams   CeedQFunctionField *qfinputfields, *qfoutputfields;
7577e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
758e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7597d8d0e25Snbeams   CeedEvalMode emode;
7607d8d0e25Snbeams   CeedBasis basis;
7617d8d0e25Snbeams   CeedBasis_Hip_shared *basis_data;
7627d8d0e25Snbeams   CeedElemRestriction Erestrict;
7637d8d0e25Snbeams   CeedElemRestriction_Hip *restr_data;
7647d8d0e25Snbeams 
7650b454692Sjeremylt   // Check for restriction only identity operator
7660b454692Sjeremylt   bool is_identity_qf;
7670b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7680b454692Sjeremylt   if (is_identity_qf) {
7690b454692Sjeremylt     CeedEvalMode emodein, emodeout;
7700b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
7710b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
7720b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
7730b454692Sjeremylt       // LCOV_EXCL_START
7740b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
7750b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
7760b454692Sjeremylt     // LCOV_EXCL_STOP
7770b454692Sjeremylt   }
7780b454692Sjeremylt 
7797d8d0e25Snbeams   ostringstream code;
7807d8d0e25Snbeams   string devFunctions(deviceFunctions);
7817d8d0e25Snbeams   code << devFunctions;
7827d8d0e25Snbeams 
7837d8d0e25Snbeams   string qFunction(qf_data->qFunctionSource);
7847d8d0e25Snbeams   string qFunctionName(qf_data->qFunctionName);
7857d8d0e25Snbeams   string oper;
7867d8d0e25Snbeams   oper = "CeedKernel_Hip_gen_" + qFunctionName;
7877d8d0e25Snbeams 
7887d8d0e25Snbeams   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
78977841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
790e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
791e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
7927d8d0e25Snbeams 
7937d8d0e25Snbeams   // Find dim and Q1d
7947d8d0e25Snbeams   bool useCollograd = true;
7957d8d0e25Snbeams   data->maxP1d = 0;
7967d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
797e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
7987d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
799e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8007d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
801e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8027d8d0e25Snbeams 
8037d8d0e25Snbeams       // Check for collocated gradient
804437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8057d8d0e25Snbeams 
8067d8d0e25Snbeams       // Collect dim and Q1d
807e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8087d8d0e25Snbeams       bool isTensor;
809e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8107d8d0e25Snbeams       if (isTensor) {
811e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
812e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
8137d8d0e25Snbeams         if (P1d>data->maxP1d) data->maxP1d = P1d;
8147d8d0e25Snbeams       } else {
8157d8d0e25Snbeams         // LCOV_EXCL_START
816e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8177d8d0e25Snbeams         // LCOV_EXCL_STOP
8187d8d0e25Snbeams         }
8197d8d0e25Snbeams     }
8207d8d0e25Snbeams   }
8217d8d0e25Snbeams   // Check output bases for Q1d, dim as well
8227d8d0e25Snbeams   //   The only imput basis might be CEED_BASIS_COLLOCATED
8237d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
824e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8257d8d0e25Snbeams 
8267d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
827e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8287d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
829e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8307d8d0e25Snbeams 
8317d8d0e25Snbeams       // Collect dim and Q1d
832e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8337d8d0e25Snbeams       bool isTensor;
834e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8357d8d0e25Snbeams       if (isTensor) {
836e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8377d8d0e25Snbeams       } else {
8387d8d0e25Snbeams         // LCOV_EXCL_START
839e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8407d8d0e25Snbeams         // LCOV_EXCL_STOP
8417d8d0e25Snbeams         }
8427d8d0e25Snbeams 
8437d8d0e25Snbeams       // Check for collocated gradient
844437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8457d8d0e25Snbeams     }
8467d8d0e25Snbeams   }
8477d8d0e25Snbeams   data->dim = dim;
8487d8d0e25Snbeams   data->Q1d = Q1d;
8497d8d0e25Snbeams 
8507d8d0e25Snbeams   // Define CEED_Q_VLA
8517d8d0e25Snbeams   if (dim != 3 || useCollograd) {
8527d8d0e25Snbeams     code << "\n#define CEED_Q_VLA 1\n\n";
8537d8d0e25Snbeams   } else {
8547d8d0e25Snbeams     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8557d8d0e25Snbeams   }
8567d8d0e25Snbeams 
8577d8d0e25Snbeams   code << qFunction;
8587d8d0e25Snbeams 
8597d8d0e25Snbeams   // Setup
8607d8d0e25Snbeams   code << "\n// -----------------------------------------------------------------------------\n";
8617d8d0e25Snbeams   code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
8627d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
8637d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
864e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8657d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
8667d8d0e25Snbeams       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
8677d8d0e25Snbeams     }
8687d8d0e25Snbeams   }
8697d8d0e25Snbeams 
8707d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
8717d8d0e25Snbeams     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
8727d8d0e25Snbeams   }
8737d8d0e25Snbeams 
8747d8d0e25Snbeams   code << "  const CeedInt Dim = "<<dim<<";\n";
8757d8d0e25Snbeams   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
8767d8d0e25Snbeams 
8777d8d0e25Snbeams   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
8787d8d0e25Snbeams   code << "  BackendData data;\n";
8797d8d0e25Snbeams   code << "  data.tidx = threadIdx.x;\n";
8807d8d0e25Snbeams   code << "  data.tidy = threadIdx.y;\n";
8817d8d0e25Snbeams   code << "  data.tidz = threadIdx.z;\n";
8827d8d0e25Snbeams   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
8837d8d0e25Snbeams   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
8847d8d0e25Snbeams 
8857d8d0e25Snbeams   code << "\n  // -- Input field constants and basis data --\n";
8867d8d0e25Snbeams   //Initialize constants, and matrices B and G
8877d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
8887d8d0e25Snbeams     code << "  // ---- Input field "<<i<<" ----\n";
8897d8d0e25Snbeams     // Get elemsize, emode, ncomp
8907d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
891e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8927d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
893e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8947d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
895e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8967d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
897e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8987d8d0e25Snbeams 
8997d8d0e25Snbeams     // Set field constants
9007d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) {
901e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
9027d8d0e25Snbeams       if (basis != CEED_BASIS_COLLOCATED) {
903e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9047d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
9057d8d0e25Snbeams       } else {
9067d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
9077d8d0e25Snbeams       }
9087d8d0e25Snbeams       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9097d8d0e25Snbeams     }
9107d8d0e25Snbeams 
9117d8d0e25Snbeams     // Load basis data
9127d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9137d8d0e25Snbeams     switch (emode) {
9147d8d0e25Snbeams     case CEED_EVAL_NONE:
9157d8d0e25Snbeams       break;
9167d8d0e25Snbeams     case CEED_EVAL_INTERP:
917e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
918437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
91980a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9207d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9217d8d0e25Snbeams       break;
9227d8d0e25Snbeams     case CEED_EVAL_GRAD:
923e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
924437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
92580a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9267d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9277d8d0e25Snbeams       if (useCollograd) {
928437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
92980a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
9307d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9317d8d0e25Snbeams       } else {
932437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
93380a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9347d8d0e25Snbeams         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9357d8d0e25Snbeams       }
9367d8d0e25Snbeams       break;
9377d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
9387d8d0e25Snbeams       break; // No action
9397d8d0e25Snbeams     case CEED_EVAL_DIV:
9407d8d0e25Snbeams       break; // TODO: Not implemented
9417d8d0e25Snbeams     case CEED_EVAL_CURL:
9427d8d0e25Snbeams       break; // TODO: Not implemented
9437d8d0e25Snbeams     }
9447d8d0e25Snbeams   }
9457d8d0e25Snbeams 
9467d8d0e25Snbeams   code << "\n  // -- Output field constants and basis data --\n";
9477d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9487d8d0e25Snbeams     code << "  // ---- Output field "<<i<<" ----\n";
9497d8d0e25Snbeams     // Get elemsize, emode, ncomp
9507d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
951e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9527d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
953e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9547d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
955e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9567d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
957e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9587d8d0e25Snbeams 
9597d8d0e25Snbeams     // Set field constants
960e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
9617d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
962e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9637d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
9647d8d0e25Snbeams     } else {
9657d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
9667d8d0e25Snbeams     }
9677d8d0e25Snbeams     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
9687d8d0e25Snbeams 
9697d8d0e25Snbeams     // Load basis data
9707d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9717d8d0e25Snbeams     switch (emode) {
9727d8d0e25Snbeams     case CEED_EVAL_NONE:
9737d8d0e25Snbeams       break; // No action
9747d8d0e25Snbeams     case CEED_EVAL_INTERP:
975e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
976437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
97780a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
9787d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
9797d8d0e25Snbeams       break;
9807d8d0e25Snbeams     case CEED_EVAL_GRAD:
981e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
982437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
98380a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
9847d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
9857d8d0e25Snbeams       if (useCollograd) {
986437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
98780a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
9887d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
9897d8d0e25Snbeams       } else {
990437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
99180a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
9927d8d0e25Snbeams         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
9937d8d0e25Snbeams       }
9947d8d0e25Snbeams       break;
9957d8d0e25Snbeams     // LCOV_EXCL_START
9967d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
9977d8d0e25Snbeams       Ceed ceed;
998e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
999e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
10007d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
10017d8d0e25Snbeams       break; // Should not occur
10027d8d0e25Snbeams     }
10037d8d0e25Snbeams     case CEED_EVAL_DIV:
10047d8d0e25Snbeams       break; // TODO: Not implemented
10057d8d0e25Snbeams     case CEED_EVAL_CURL:
10067d8d0e25Snbeams       break; // TODO: Not implemented
10077d8d0e25Snbeams       // LCOV_EXCL_STOP
10087d8d0e25Snbeams     }
10097d8d0e25Snbeams   }
10107d8d0e25Snbeams   code << "\n  // -- Element loop --\n";
10117d8d0e25Snbeams   code << "  __syncthreads();\n";
10127d8d0e25Snbeams   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
10137d8d0e25Snbeams   // Input basis apply if needed
10147d8d0e25Snbeams   // Generate the correct eval mode code for each input
10157d8d0e25Snbeams   code << "    // -- Input field restrictions and basis actions --\n";
10167d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
10177d8d0e25Snbeams     code << "    // ---- Input field "<<i<<" ----\n";
10187d8d0e25Snbeams     // Get elemsize, emode, ncomp
10197d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1020e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10217d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1022e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10237d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1024e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10257d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1026e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10277d8d0e25Snbeams 
10287d8d0e25Snbeams     // Restriction
10297d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT &&
10307d8d0e25Snbeams         !((emode == CEED_EVAL_NONE) && useCollograd)) {
10317d8d0e25Snbeams       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10327d8d0e25Snbeams 
10337d8d0e25Snbeams       bool isStrided;
1034e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10357d8d0e25Snbeams       if (!isStrided) {
10367d8d0e25Snbeams         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1037e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10387d8d0e25Snbeams         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10397d8d0e25Snbeams         CeedInt compstride;
1040e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10417d8d0e25Snbeams         code << "    // CompStride: "<<compstride<<"\n";
1042e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10437d8d0e25Snbeams         data->indices.in[i] = restr_data->d_ind;
10447d8d0e25Snbeams         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
10457d8d0e25Snbeams       } else {
10467d8d0e25Snbeams         bool backendstrides;
10477d8d0e25Snbeams         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1048e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10497d8d0e25Snbeams         CeedInt nelem;
10507d8d0e25Snbeams         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1051e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10527d8d0e25Snbeams         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
10537d8d0e25Snbeams         if (!backendstrides) {
10547d8d0e25Snbeams           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1055e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
10567d8d0e25Snbeams         }
10577d8d0e25Snbeams         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
10587d8d0e25Snbeams         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
10597d8d0e25Snbeams       }
10607d8d0e25Snbeams     }
10617d8d0e25Snbeams 
10627d8d0e25Snbeams     // Basis action
10637d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10647d8d0e25Snbeams     switch (emode) {
10657d8d0e25Snbeams     case CEED_EVAL_NONE:
10667d8d0e25Snbeams       if (!useCollograd) {
10677d8d0e25Snbeams         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
10687d8d0e25Snbeams       }
10697d8d0e25Snbeams       break;
10707d8d0e25Snbeams     case CEED_EVAL_INTERP:
10717d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
10727d8d0e25Snbeams       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
10737d8d0e25Snbeams       break;
10747d8d0e25Snbeams     case CEED_EVAL_GRAD:
10757d8d0e25Snbeams       if (useCollograd) {
10767d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
10777d8d0e25Snbeams         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
10787d8d0e25Snbeams       } else {
10797d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
10807d8d0e25Snbeams         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
10817d8d0e25Snbeams       }
10827d8d0e25Snbeams       break;
10837d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
10847d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1085e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1086e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1087437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
10887d8d0e25Snbeams       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
10897d8d0e25Snbeams       break; // No action
10907d8d0e25Snbeams     case CEED_EVAL_DIV:
10917d8d0e25Snbeams       break; // TODO: Not implemented
10927d8d0e25Snbeams     case CEED_EVAL_CURL:
10937d8d0e25Snbeams       break; // TODO: Not implemented
10947d8d0e25Snbeams     }
10957d8d0e25Snbeams   }
10967d8d0e25Snbeams 
10977d8d0e25Snbeams   // Q function
10987d8d0e25Snbeams   code << "\n    // -- Output field setup --\n";
10997d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
11007d8d0e25Snbeams       code << "\n    // ---- Output field "<<i<<" ----\n";
11017d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1102e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
11037d8d0e25Snbeams     if (emode==CEED_EVAL_GRAD)
11047d8d0e25Snbeams     {
11057d8d0e25Snbeams       if (useCollograd) {
11067d8d0e25Snbeams         //Accumulator for gradient slices
11077d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11087d8d0e25Snbeams         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
11097d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
11107d8d0e25Snbeams         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
11117d8d0e25Snbeams         code << "      }\n";
11127d8d0e25Snbeams         code << "    }\n";
11137d8d0e25Snbeams       } else {
11147d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
11157d8d0e25Snbeams       }
11167d8d0e25Snbeams     }
11177d8d0e25Snbeams     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
11187d8d0e25Snbeams     {
11197d8d0e25Snbeams       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11207d8d0e25Snbeams     }
11217d8d0e25Snbeams   }
11227d8d0e25Snbeams   // We treat quadrature points per slice in 3d to save registers
11237d8d0e25Snbeams   if (useCollograd) {
11247d8d0e25Snbeams     code << "\n    // Note: Collocated Gradient\n";
11257d8d0e25Snbeams     code << "#pragma unroll\n";
11267d8d0e25Snbeams     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
11277d8d0e25Snbeams     code << "      // -- Input fields --\n";
11287d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
11297d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
11307d8d0e25Snbeams       // Get elemsize, emode, ncomp
11317d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1132e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
11337d8d0e25Snbeams       // Basis action
11347d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
11357d8d0e25Snbeams       switch (emode) {
11367d8d0e25Snbeams       case CEED_EVAL_NONE:
11377d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11387d8d0e25Snbeams 
11397d8d0e25Snbeams         bool isStrided;
1140e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1141e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1142e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11437d8d0e25Snbeams         if (!isStrided) {
11447d8d0e25Snbeams           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1145e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11467d8d0e25Snbeams           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11477d8d0e25Snbeams           CeedInt compstride;
1148e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11497d8d0e25Snbeams           code << "      // CompStride: "<<compstride<<"\n";
1150e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11517d8d0e25Snbeams           data->indices.in[i] = restr_data->d_ind;
11527d8d0e25Snbeams           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
11537d8d0e25Snbeams         } else {
11547d8d0e25Snbeams           bool backendstrides;
11557d8d0e25Snbeams           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1156e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11577d8d0e25Snbeams           CeedInt nelem;
11587d8d0e25Snbeams           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1159e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11607d8d0e25Snbeams           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
11617d8d0e25Snbeams           if (!backendstrides) {
11627d8d0e25Snbeams             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1163e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
11647d8d0e25Snbeams           }
11657d8d0e25Snbeams           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
11667d8d0e25Snbeams           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
11677d8d0e25Snbeams         }
11687d8d0e25Snbeams         break;
11697d8d0e25Snbeams       case CEED_EVAL_INTERP:
11707d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11717d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
11727d8d0e25Snbeams         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
11737d8d0e25Snbeams         code << "      }\n";
11747d8d0e25Snbeams         break;
11757d8d0e25Snbeams       case CEED_EVAL_GRAD:
11767d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
11777d8d0e25Snbeams         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
11787d8d0e25Snbeams         break;
11797d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
11807d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[1];\n";
11817d8d0e25Snbeams         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
11827d8d0e25Snbeams         break; // No action
11837d8d0e25Snbeams       case CEED_EVAL_DIV:
11847d8d0e25Snbeams         break; // TODO: Not implemented
11857d8d0e25Snbeams       case CEED_EVAL_CURL:
11867d8d0e25Snbeams         break; // TODO: Not implemented
11877d8d0e25Snbeams       }
11887d8d0e25Snbeams     }
11897d8d0e25Snbeams     code << "\n      // -- Output fields --\n";
11907d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
11917d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
11927d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1193e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
11947d8d0e25Snbeams       // Basis action
11957d8d0e25Snbeams       switch (emode) {
11967d8d0e25Snbeams       case CEED_EVAL_NONE:
11977d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
11987d8d0e25Snbeams         break; // No action
11997d8d0e25Snbeams       case CEED_EVAL_INTERP:
12007d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12017d8d0e25Snbeams         break;
12027d8d0e25Snbeams       case CEED_EVAL_GRAD:
12037d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
12047d8d0e25Snbeams         break;
12057d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12067d8d0e25Snbeams         break; // Should not occur
12077d8d0e25Snbeams       case CEED_EVAL_DIV:
12087d8d0e25Snbeams         break; // TODO: Not implemented
12097d8d0e25Snbeams       case CEED_EVAL_CURL:
12107d8d0e25Snbeams         break; // TODO: Not implemented
12117d8d0e25Snbeams       }
12127d8d0e25Snbeams     }
12137d8d0e25Snbeams   } else {
12147d8d0e25Snbeams     code << "\n      // Note: No Collocated Gradient\n";
12157d8d0e25Snbeams     code << "      // -- Input fields --\n";
12167d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
12177d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
12187d8d0e25Snbeams       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
12197d8d0e25Snbeams     }
12207d8d0e25Snbeams     code << "      // -- Output fields --\n";
12217d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12227d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12237d8d0e25Snbeams       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
12247d8d0e25Snbeams     }
12257d8d0e25Snbeams   }
12267d8d0e25Snbeams   code << "\n      // -- QFunction Inputs and outputs --\n";
12277d8d0e25Snbeams   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12287d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
12297d8d0e25Snbeams     code << "      // ---- Input field "<<i<<" ----\n";
12307d8d0e25Snbeams     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12317d8d0e25Snbeams   }
12327d8d0e25Snbeams   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12337d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
12347d8d0e25Snbeams     code << "      // ---- Output field "<<i<<" ----\n";
12357d8d0e25Snbeams     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12367d8d0e25Snbeams   }
12377d8d0e25Snbeams   code << "\n      // -- Apply QFunction --\n";
12387d8d0e25Snbeams   code << "      "<<qFunctionName<<"(ctx, ";
12397d8d0e25Snbeams   if (dim != 3 || useCollograd) {
12407d8d0e25Snbeams     code << "1";
12417d8d0e25Snbeams   } else {
12427d8d0e25Snbeams     code << "Q1d";
12437d8d0e25Snbeams   }
12447d8d0e25Snbeams   code << ", in, out);\n";
12457d8d0e25Snbeams   if (useCollograd) {
12467d8d0e25Snbeams     code << "\n      // Note: Collocated Gradient\n";
12477d8d0e25Snbeams     code << "      // -- Output fields --\n";
12487d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12497d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12507d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1251e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12527d8d0e25Snbeams       // Basis action
12537d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
12547d8d0e25Snbeams       switch (emode) {
12557d8d0e25Snbeams       case CEED_EVAL_NONE:
12567d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12577d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12587d8d0e25Snbeams         code << "      }\n";
12597d8d0e25Snbeams         break; // No action
12607d8d0e25Snbeams       case CEED_EVAL_INTERP:
12617d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12627d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12637d8d0e25Snbeams         code << "      }\n";
12647d8d0e25Snbeams         break;
12657d8d0e25Snbeams       case CEED_EVAL_GRAD:
12667d8d0e25Snbeams         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
12677d8d0e25Snbeams         break;
12687d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12697d8d0e25Snbeams         break; // Should not occur
12707d8d0e25Snbeams       case CEED_EVAL_DIV:
12717d8d0e25Snbeams         break; // TODO: Not implemented
12727d8d0e25Snbeams       case CEED_EVAL_CURL:
12737d8d0e25Snbeams         break; // TODO: Not implemented
12747d8d0e25Snbeams       }
12757d8d0e25Snbeams     }
12767d8d0e25Snbeams     code << "    }\n";
12777d8d0e25Snbeams   }
12787d8d0e25Snbeams 
12797d8d0e25Snbeams   // Output basis apply if needed
12807d8d0e25Snbeams   // Generate the correct eval mode code for each output
12817d8d0e25Snbeams   code << "\n    // -- Output field basis action and restrictions --\n";
12827d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
12837d8d0e25Snbeams     code << "    // ---- Output field "<<i<<" ----\n";
12847d8d0e25Snbeams     // Get elemsize, emode, ncomp
12857d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1286e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
12877d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1288e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
12897d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1290e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
12917d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1292e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
12937d8d0e25Snbeams     // Basis action
12947d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
12957d8d0e25Snbeams     switch (emode) {
12967d8d0e25Snbeams     case CEED_EVAL_NONE:
12977d8d0e25Snbeams       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
12987d8d0e25Snbeams       break; // No action
12997d8d0e25Snbeams     case CEED_EVAL_INTERP:
13007d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13017d8d0e25Snbeams       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13027d8d0e25Snbeams       break;
13037d8d0e25Snbeams     case CEED_EVAL_GRAD:
13047d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13057d8d0e25Snbeams       if (useCollograd) {
13067d8d0e25Snbeams         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13077d8d0e25Snbeams       } else {
13087d8d0e25Snbeams         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
13097d8d0e25Snbeams       }
13107d8d0e25Snbeams       break;
13117d8d0e25Snbeams     // LCOV_EXCL_START
13127d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
13137d8d0e25Snbeams       Ceed ceed;
1314e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1315e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
13167d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
13177d8d0e25Snbeams       break; // Should not occur
13187d8d0e25Snbeams     }
13197d8d0e25Snbeams     case CEED_EVAL_DIV:
13207d8d0e25Snbeams       break; // TODO: Not implemented
13217d8d0e25Snbeams     case CEED_EVAL_CURL:
13227d8d0e25Snbeams       break; // TODO: Not implemented
13237d8d0e25Snbeams       // LCOV_EXCL_STOP
13247d8d0e25Snbeams     }
13257d8d0e25Snbeams     // Restriction
13267d8d0e25Snbeams       bool isStrided;
1327e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13287d8d0e25Snbeams     if (!isStrided) {
13297d8d0e25Snbeams       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1330e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13317d8d0e25Snbeams       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13327d8d0e25Snbeams       CeedInt compstride;
1333e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13347d8d0e25Snbeams       code << "    // CompStride: "<<compstride<<"\n";
1335e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13367d8d0e25Snbeams       data->indices.out[i] = restr_data->d_ind;
13377d8d0e25Snbeams       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
13387d8d0e25Snbeams     } else {
13397d8d0e25Snbeams       bool backendstrides;
13407d8d0e25Snbeams       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1341e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13427d8d0e25Snbeams       CeedInt nelem;
13437d8d0e25Snbeams       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1344e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13457d8d0e25Snbeams       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
13467d8d0e25Snbeams       if (!backendstrides) {
13477d8d0e25Snbeams         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1348e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
13497d8d0e25Snbeams       }
13507d8d0e25Snbeams       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
13517d8d0e25Snbeams       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
13527d8d0e25Snbeams     }
13537d8d0e25Snbeams   }
13547d8d0e25Snbeams 
13557d8d0e25Snbeams   code << "  }\n";
13567d8d0e25Snbeams   code << "}\n";
13577d8d0e25Snbeams   code << "// -----------------------------------------------------------------------------\n\n";
13587d8d0e25Snbeams 
13597d8d0e25Snbeams   // View kernel for debugging
1360*46dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13613f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
13627d8d0e25Snbeams 
13637d8d0e25Snbeams   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1,
13647d8d0e25Snbeams                          "T1d", CeedIntMax(Q1d, data->maxP1d));
1365e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
13667d8d0e25Snbeams   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1367e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
13687d8d0e25Snbeams 
1369e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1370e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
13717d8d0e25Snbeams }
13727d8d0e25Snbeams //------------------------------------------------------------------------------
1373