xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 539ec17d7efe6a80c4ab8b3d6b91c3433981191e)
17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details.
47d8d0e25Snbeams //
57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software
67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral
77d8d0e25Snbeams // element discretizations for exascale applications. For more information and
87d8d0e25Snbeams // source code availability see http://github.com/ceed.
97d8d0e25Snbeams //
107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office
127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for
137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including
147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early
157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative.
163d576824SJeremy L Thompson 
177d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
187d8d0e25Snbeams 
19ec3da8bcSJed Brown #include <ceed/ceed.h>
20ec3da8bcSJed Brown #include <ceed/backend.h>
217d8d0e25Snbeams #include <iostream>
223d576824SJeremy L Thompson #include <string>
237d8d0e25Snbeams #include <sstream>
243d576824SJeremy L Thompson #include "ceed-hip-gen.h"
250d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
267d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
277d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
287d8d0e25Snbeams 
29b3e1519bSnbeams 
30b3e1519bSnbeams //------------------------------------------------------------------------------
31b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
32b3e1519bSnbeams //------------------------------------------------------------------------------
3337c3b1cfSnbeams extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem,
34b3e1519bSnbeams      	                                  const CeedInt P1d, const CeedInt Q1d,
35b3e1519bSnbeams 				          CeedInt *block_sizes) {
36b3e1519bSnbeams 
37b3e1519bSnbeams   const CeedInt thread1d = CeedIntMax(Q1d, P1d);
38b3e1519bSnbeams   if (dim==1) {
39b3e1519bSnbeams     CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64;
40b3e1519bSnbeams     elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1;
41b3e1519bSnbeams     block_sizes[0] = thread1d;
42b3e1519bSnbeams     block_sizes[1] = 1;
43b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
44b3e1519bSnbeams   } else if (dim==2) {
45b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<4? 16 : 2;
46b3e1519bSnbeams     block_sizes[0] = thread1d;
47b3e1519bSnbeams     block_sizes[1] = thread1d;
48b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
49b3e1519bSnbeams   } else if (dim==3) {
50b3e1519bSnbeams     const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1);
51b3e1519bSnbeams     block_sizes[0] = thread1d;
52b3e1519bSnbeams     block_sizes[1] = thread1d;
53b3e1519bSnbeams     block_sizes[2] = elemsPerBlock;
54b3e1519bSnbeams   }
55b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
56b3e1519bSnbeams }
57b3e1519bSnbeams 
587d8d0e25Snbeams static const char *deviceFunctions = QUOTE(
597d8d0e25Snbeams 
607d8d0e25Snbeams //------------------------------------------------------------------------------
617d8d0e25Snbeams // Typedefs
627d8d0e25Snbeams //------------------------------------------------------------------------------
637d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields;
647d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt;
657d8d0e25Snbeams 
667d8d0e25Snbeams typedef struct {
677d8d0e25Snbeams   CeedInt tidx;
687d8d0e25Snbeams   CeedInt tidy;
697d8d0e25Snbeams   CeedInt tidz;
707d8d0e25Snbeams   CeedInt tid;
717d8d0e25Snbeams   CeedScalar* slice;
727d8d0e25Snbeams } BackendData;
737d8d0e25Snbeams 
747d8d0e25Snbeams //------------------------------------------------------------------------------
757d8d0e25Snbeams // Load matrices for basis actions
767d8d0e25Snbeams //------------------------------------------------------------------------------
777d8d0e25Snbeams template <int P, int Q>
787d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
797d8d0e25Snbeams   for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
807d8d0e25Snbeams     B[i] = d_B[i];
817d8d0e25Snbeams }
827d8d0e25Snbeams 
837d8d0e25Snbeams //------------------------------------------------------------------------------
847d8d0e25Snbeams // 1D
857d8d0e25Snbeams //------------------------------------------------------------------------------
867d8d0e25Snbeams 
877d8d0e25Snbeams //------------------------------------------------------------------------------
887d8d0e25Snbeams // L-vector -> E-vector, offsets provided
897d8d0e25Snbeams //------------------------------------------------------------------------------
907d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
917d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
927d8d0e25Snbeams   if (data.tidx < P1d) {
937d8d0e25Snbeams     const CeedInt node = data.tidx;
947d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
957d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
967d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
977d8d0e25Snbeams   }
987d8d0e25Snbeams }
997d8d0e25Snbeams 
1007d8d0e25Snbeams //------------------------------------------------------------------------------
1017d8d0e25Snbeams // L-vector -> E-vector, strided
1027d8d0e25Snbeams //------------------------------------------------------------------------------
1037d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1047d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
1057d8d0e25Snbeams   if (data.tidx < P1d) {
1067d8d0e25Snbeams     const CeedInt node = data.tidx;
1077d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1087d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1097d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
1107d8d0e25Snbeams   }
1117d8d0e25Snbeams }
1127d8d0e25Snbeams 
1137d8d0e25Snbeams //------------------------------------------------------------------------------
1147d8d0e25Snbeams // E-vector -> L-vector, offsets provided
1157d8d0e25Snbeams //------------------------------------------------------------------------------
1167d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
1177d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
1187d8d0e25Snbeams   if (data.tidx < P1d) {
1197d8d0e25Snbeams     const CeedInt node = data.tidx;
1207d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d];
1217d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1227d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
1237d8d0e25Snbeams   }
1247d8d0e25Snbeams }
1257d8d0e25Snbeams 
1267d8d0e25Snbeams //------------------------------------------------------------------------------
1277d8d0e25Snbeams // E-vector -> L-vector, strided
1287d8d0e25Snbeams //------------------------------------------------------------------------------
1297d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
1307d8d0e25Snbeams inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
1317d8d0e25Snbeams   if (data.tidx < P1d) {
1327d8d0e25Snbeams     const CeedInt node = data.tidx;
1337d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
1347d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
1357d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
1367d8d0e25Snbeams   }
1377d8d0e25Snbeams }
1387d8d0e25Snbeams 
1397d8d0e25Snbeams //------------------------------------------------------------------------------
1407d8d0e25Snbeams // 1D tensor contraction x
1417d8d0e25Snbeams //------------------------------------------------------------------------------
1427d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1437d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1447d8d0e25Snbeams   data.slice[data.tidx] = *U;
1457d8d0e25Snbeams   __syncthreads();
1467d8d0e25Snbeams   *V = 0.0;
1477d8d0e25Snbeams   if (data.tidx < Q1d)
1487d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
1497d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
1507d8d0e25Snbeams   __syncthreads();
1517d8d0e25Snbeams }
1527d8d0e25Snbeams 
1537d8d0e25Snbeams //------------------------------------------------------------------------------
1547d8d0e25Snbeams // 1D transpose tensor contraction x
1557d8d0e25Snbeams //------------------------------------------------------------------------------
1567d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1577d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
1587d8d0e25Snbeams   data.slice[data.tidx] = *U;
1597d8d0e25Snbeams   __syncthreads();
1607d8d0e25Snbeams   *V = 0.0;
1617d8d0e25Snbeams   if (data.tidx < P1d)
1627d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
1637d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
1647d8d0e25Snbeams   __syncthreads();
1657d8d0e25Snbeams }
1667d8d0e25Snbeams 
1677d8d0e25Snbeams //------------------------------------------------------------------------------
1687d8d0e25Snbeams // 1D interpolate to quadrature points
1697d8d0e25Snbeams //------------------------------------------------------------------------------
1707d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1717d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1727d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1737d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1747d8d0e25Snbeams }
1757d8d0e25Snbeams 
1767d8d0e25Snbeams //------------------------------------------------------------------------------
1777d8d0e25Snbeams // 1D interpolate transpose
1787d8d0e25Snbeams //------------------------------------------------------------------------------
1797d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1807d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
1817d8d0e25Snbeams   for (CeedInt comp=0; comp<NCOMP; comp++)
1827d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
1837d8d0e25Snbeams }
1847d8d0e25Snbeams 
1857d8d0e25Snbeams //------------------------------------------------------------------------------
1867d8d0e25Snbeams // 1D derivatives at quadrature points
1877d8d0e25Snbeams //------------------------------------------------------------------------------
1887d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1897d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1907d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
1917d8d0e25Snbeams     ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
1927d8d0e25Snbeams }
1937d8d0e25Snbeams 
1947d8d0e25Snbeams //------------------------------------------------------------------------------
1957d8d0e25Snbeams // 1D derivatives transpose
1967d8d0e25Snbeams //------------------------------------------------------------------------------
1977d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
1987d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
1997d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++)
2007d8d0e25Snbeams     ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
2017d8d0e25Snbeams }
2027d8d0e25Snbeams 
2037d8d0e25Snbeams //------------------------------------------------------------------------------
2047d8d0e25Snbeams // 2D
2057d8d0e25Snbeams //------------------------------------------------------------------------------
2067d8d0e25Snbeams 
2077d8d0e25Snbeams //------------------------------------------------------------------------------
2087d8d0e25Snbeams // L-vector -> E-vector, offsets provided
2097d8d0e25Snbeams //------------------------------------------------------------------------------
2107d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2117d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
2127d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2137d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2147d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2157d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2167d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
2177d8d0e25Snbeams   }
2187d8d0e25Snbeams }
2197d8d0e25Snbeams 
2207d8d0e25Snbeams //------------------------------------------------------------------------------
2217d8d0e25Snbeams // L-vector -> E-vector, strided
2227d8d0e25Snbeams //------------------------------------------------------------------------------
2237d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2247d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
2257d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2267d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2277d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2287d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2297d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
2307d8d0e25Snbeams   }
2317d8d0e25Snbeams }
2327d8d0e25Snbeams 
2337d8d0e25Snbeams //------------------------------------------------------------------------------
2347d8d0e25Snbeams // E-vector -> L-vector, offsets provided
2357d8d0e25Snbeams //------------------------------------------------------------------------------
2367d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
2377d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
2387d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2397d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2407d8d0e25Snbeams     const CeedInt ind = indices[node + elem * P1d*P1d];
2417d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2427d8d0e25Snbeams       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
2437d8d0e25Snbeams   }
2447d8d0e25Snbeams }
2457d8d0e25Snbeams 
2467d8d0e25Snbeams //------------------------------------------------------------------------------
2477d8d0e25Snbeams // E-vector -> L-vector, strided
2487d8d0e25Snbeams //------------------------------------------------------------------------------
2497d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
2507d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
2517d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d) {
2527d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*P1d;
2537d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
2547d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
2557d8d0e25Snbeams       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
2567d8d0e25Snbeams   }
2577d8d0e25Snbeams }
2587d8d0e25Snbeams 
2597d8d0e25Snbeams //------------------------------------------------------------------------------
2607d8d0e25Snbeams // 2D tensor contraction x
2617d8d0e25Snbeams //------------------------------------------------------------------------------
2627d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2637d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2647d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2657d8d0e25Snbeams   __syncthreads();
2667d8d0e25Snbeams   *V = 0.0;
2677d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2687d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2697d8d0e25Snbeams       *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
2707d8d0e25Snbeams   __syncthreads();
2717d8d0e25Snbeams }
2727d8d0e25Snbeams 
2737d8d0e25Snbeams //------------------------------------------------------------------------------
2747d8d0e25Snbeams // 2D tensor contract y
2757d8d0e25Snbeams //------------------------------------------------------------------------------
2767d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2777d8d0e25Snbeams inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2787d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2797d8d0e25Snbeams   __syncthreads();
2807d8d0e25Snbeams   *V = 0.0;
2817d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d)
2827d8d0e25Snbeams     for (CeedInt i = 0; i < P1d; ++i)
2837d8d0e25Snbeams       *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2847d8d0e25Snbeams   __syncthreads();
2857d8d0e25Snbeams }
2867d8d0e25Snbeams 
2877d8d0e25Snbeams //------------------------------------------------------------------------------
2887d8d0e25Snbeams // 2D transpose tensor contract y
2897d8d0e25Snbeams //------------------------------------------------------------------------------
2907d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
2917d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
2927d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
2937d8d0e25Snbeams   __syncthreads();
2947d8d0e25Snbeams   *V = 0.0;
2957d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < P1d)
2967d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
2977d8d0e25Snbeams       *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction
2987d8d0e25Snbeams   __syncthreads();
2997d8d0e25Snbeams }
3007d8d0e25Snbeams 
3017d8d0e25Snbeams //------------------------------------------------------------------------------
3027d8d0e25Snbeams // 2D transpose tensor contract x
3037d8d0e25Snbeams //------------------------------------------------------------------------------
3047d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3057d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3067d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
3077d8d0e25Snbeams   __syncthreads();
3087d8d0e25Snbeams   *V = 0.0;
3097d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3107d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3117d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3127d8d0e25Snbeams   __syncthreads();
3137d8d0e25Snbeams }
3147d8d0e25Snbeams 
3157d8d0e25Snbeams //------------------------------------------------------------------------------
3167d8d0e25Snbeams // 2D transpose tensor contract and add x
3177d8d0e25Snbeams //------------------------------------------------------------------------------
3187d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3197d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
3207d8d0e25Snbeams   data.slice[data.tidx+data.tidy*T1d] = *U;
3217d8d0e25Snbeams   __syncthreads();
3227d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3237d8d0e25Snbeams     for (CeedInt i = 0; i < Q1d; ++i)
3247d8d0e25Snbeams       *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction
3257d8d0e25Snbeams   __syncthreads();
3267d8d0e25Snbeams }
3277d8d0e25Snbeams 
3287d8d0e25Snbeams //------------------------------------------------------------------------------
3297d8d0e25Snbeams // 2D interpolate to quadrature points
3307d8d0e25Snbeams //------------------------------------------------------------------------------
3317d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3327d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3337d8d0e25Snbeams   CeedScalar r_t[1];
3347d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3357d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3367d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3377d8d0e25Snbeams   }
3387d8d0e25Snbeams }
3397d8d0e25Snbeams 
3407d8d0e25Snbeams //------------------------------------------------------------------------------
3417d8d0e25Snbeams // 2D interpolate transpose
3427d8d0e25Snbeams //------------------------------------------------------------------------------
3437d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3447d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
3457d8d0e25Snbeams   CeedScalar r_t[1];
3467d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3477d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3487d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3497d8d0e25Snbeams   }
3507d8d0e25Snbeams }
3517d8d0e25Snbeams 
3527d8d0e25Snbeams //------------------------------------------------------------------------------
3537d8d0e25Snbeams // 2D derivatives at quadrature points
3547d8d0e25Snbeams //------------------------------------------------------------------------------
3557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3567d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3577d8d0e25Snbeams   CeedScalar r_t[1];
3587d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3597d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
3607d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
3617d8d0e25Snbeams     ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
3627d8d0e25Snbeams     ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
3637d8d0e25Snbeams   }
3647d8d0e25Snbeams }
3657d8d0e25Snbeams 
3667d8d0e25Snbeams //------------------------------------------------------------------------------
3677d8d0e25Snbeams // 2D derivatives transpose
3687d8d0e25Snbeams //------------------------------------------------------------------------------
3697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
3707d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
3717d8d0e25Snbeams   CeedScalar r_t[1];
3727d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
3737d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
3747d8d0e25Snbeams     ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
3757d8d0e25Snbeams     ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
3767d8d0e25Snbeams     ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
3777d8d0e25Snbeams   }
3787d8d0e25Snbeams }
3797d8d0e25Snbeams 
3807d8d0e25Snbeams //------------------------------------------------------------------------------
3817d8d0e25Snbeams // 3D
3827d8d0e25Snbeams //------------------------------------------------------------------------------
3837d8d0e25Snbeams 
3847d8d0e25Snbeams //------------------------------------------------------------------------------
3857d8d0e25Snbeams // L-vector -> E-vector, offsets provided
3867d8d0e25Snbeams //------------------------------------------------------------------------------
3877d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
3887d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
3897d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
3907d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
3917d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
3927d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
3937d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
3947d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
3957d8d0e25Snbeams     }
3967d8d0e25Snbeams }
3977d8d0e25Snbeams 
3987d8d0e25Snbeams //------------------------------------------------------------------------------
3997d8d0e25Snbeams // L-vector -> E-vector, strided
4007d8d0e25Snbeams //------------------------------------------------------------------------------
4017d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4027d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
4037d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4047d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4057d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4067d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4077d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4087d8d0e25Snbeams         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
4097d8d0e25Snbeams     }
4107d8d0e25Snbeams }
4117d8d0e25Snbeams 
4127d8d0e25Snbeams //------------------------------------------------------------------------------
4137d8d0e25Snbeams // E-vector -> Q-vector, offests provided
4147d8d0e25Snbeams //------------------------------------------------------------------------------
4157d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d>
4167d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
4177d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4187d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4197d8d0e25Snbeams     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
4207d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4217d8d0e25Snbeams       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
4227d8d0e25Snbeams   }
4237d8d0e25Snbeams }
4247d8d0e25Snbeams 
4257d8d0e25Snbeams //------------------------------------------------------------------------------
4267d8d0e25Snbeams // E-vector -> Q-vector, strided
4277d8d0e25Snbeams //------------------------------------------------------------------------------
4287d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4297d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
4307d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
4317d8d0e25Snbeams     const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
4327d8d0e25Snbeams     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4337d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp)
4347d8d0e25Snbeams       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
4357d8d0e25Snbeams   }
4367d8d0e25Snbeams }
4377d8d0e25Snbeams 
4387d8d0e25Snbeams //------------------------------------------------------------------------------
4397d8d0e25Snbeams // E-vector -> L-vector, offsets provided
4407d8d0e25Snbeams //------------------------------------------------------------------------------
4417d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d>
4427d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
4437d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4447d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4457d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4467d8d0e25Snbeams       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
4477d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4487d8d0e25Snbeams         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
4497d8d0e25Snbeams     }
4507d8d0e25Snbeams }
4517d8d0e25Snbeams 
4527d8d0e25Snbeams //------------------------------------------------------------------------------
4537d8d0e25Snbeams // E-vector -> L-vector, strided
4547d8d0e25Snbeams //------------------------------------------------------------------------------
4557d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
4567d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
4577d8d0e25Snbeams   if (data.tidx < P1d && data.tidy < P1d)
4587d8d0e25Snbeams     for (CeedInt z = 0; z < P1d; ++z) {
4597d8d0e25Snbeams       const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
4607d8d0e25Snbeams       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
4617d8d0e25Snbeams       for (CeedInt comp = 0; comp < NCOMP; ++comp)
4627d8d0e25Snbeams         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
4637d8d0e25Snbeams     }
4647d8d0e25Snbeams }
4657d8d0e25Snbeams 
4667d8d0e25Snbeams //------------------------------------------------------------------------------
4677d8d0e25Snbeams // 3D tensor contract x
4687d8d0e25Snbeams //------------------------------------------------------------------------------
4697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4707d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4717d8d0e25Snbeams   CeedScalar r_B[P1d];
4727d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4737d8d0e25Snbeams     r_B[i] = B[i + data.tidx*P1d];
4747d8d0e25Snbeams 
4757d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4767d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4777d8d0e25Snbeams     __syncthreads();
4787d8d0e25Snbeams     V[k] = 0.0;
4797d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
4807d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
4817d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
4827d8d0e25Snbeams     __syncthreads();
4837d8d0e25Snbeams   }
4847d8d0e25Snbeams }
4857d8d0e25Snbeams 
4867d8d0e25Snbeams //------------------------------------------------------------------------------
4877d8d0e25Snbeams // 3D tensor contract y
4887d8d0e25Snbeams //------------------------------------------------------------------------------
4897d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
4907d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
4917d8d0e25Snbeams   CeedScalar r_B[P1d];
4927d8d0e25Snbeams   for (CeedInt i = 0; i < P1d; ++i)
4937d8d0e25Snbeams     r_B[i] = B[i + data.tidy*P1d];
4947d8d0e25Snbeams 
4957d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
4967d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
4977d8d0e25Snbeams     __syncthreads();
4987d8d0e25Snbeams     V[k] = 0.0;
4997d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5007d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
5017d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5027d8d0e25Snbeams     __syncthreads();
5037d8d0e25Snbeams   }
5047d8d0e25Snbeams }
5057d8d0e25Snbeams 
5067d8d0e25Snbeams //------------------------------------------------------------------------------
5077d8d0e25Snbeams // 3D tensor contract z
5087d8d0e25Snbeams //------------------------------------------------------------------------------
5097d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5107d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5117d8d0e25Snbeams   for (CeedInt k = 0; k < Q1d; ++k) {
5127d8d0e25Snbeams     V[k] = 0.0;
5137d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5147d8d0e25Snbeams       for (CeedInt i = 0; i < P1d; ++i)
5157d8d0e25Snbeams         V[k] += B[i + k*P1d] * U[i]; // Contract z direction
5167d8d0e25Snbeams   }
5177d8d0e25Snbeams }
5187d8d0e25Snbeams 
5197d8d0e25Snbeams //------------------------------------------------------------------------------
5207d8d0e25Snbeams // 3D transpose tensor contract z
5217d8d0e25Snbeams //------------------------------------------------------------------------------
5227d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5237d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5247d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5257d8d0e25Snbeams     V[k] = 0.0;
5267d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < Q1d)
5277d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5287d8d0e25Snbeams         V[k] += B[k + i*P1d] * U[i]; // Contract z direction
5297d8d0e25Snbeams   }
5307d8d0e25Snbeams }
5317d8d0e25Snbeams 
5327d8d0e25Snbeams //------------------------------------------------------------------------------
5337d8d0e25Snbeams // 3D transpose tensor contract y
5347d8d0e25Snbeams //------------------------------------------------------------------------------
5357d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5367d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5377d8d0e25Snbeams   CeedScalar r_B[Q1d];
5387d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5397d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5407d8d0e25Snbeams 
5417d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5427d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5437d8d0e25Snbeams     __syncthreads();
5447d8d0e25Snbeams     V[k] = 0.0;
5457d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5467d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5477d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5487d8d0e25Snbeams     __syncthreads();
5497d8d0e25Snbeams   }
5507d8d0e25Snbeams }
5517d8d0e25Snbeams 
5527d8d0e25Snbeams //------------------------------------------------------------------------------
5537d8d0e25Snbeams // 3D transpose tensor contract add y
5547d8d0e25Snbeams //------------------------------------------------------------------------------
5557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5567d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5577d8d0e25Snbeams   CeedScalar r_B[Q1d];
5587d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5597d8d0e25Snbeams     r_B[i] = B[data.tidy + i*P1d];
5607d8d0e25Snbeams 
5617d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5627d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5637d8d0e25Snbeams     __syncthreads();
5647d8d0e25Snbeams     if (data.tidx < Q1d && data.tidy < P1d)
5657d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5667d8d0e25Snbeams         V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
5677d8d0e25Snbeams     __syncthreads();
5687d8d0e25Snbeams   }
5697d8d0e25Snbeams }
5707d8d0e25Snbeams 
5717d8d0e25Snbeams //------------------------------------------------------------------------------
5727d8d0e25Snbeams // 3D transpose tensor contract x
5737d8d0e25Snbeams //------------------------------------------------------------------------------
5747d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5757d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5767d8d0e25Snbeams   CeedScalar r_B[Q1d];
5777d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5787d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5797d8d0e25Snbeams 
5807d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
5817d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
5827d8d0e25Snbeams     __syncthreads();
5837d8d0e25Snbeams     V[k] = 0.0;
5847d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
5857d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
5867d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
5877d8d0e25Snbeams     __syncthreads();
5887d8d0e25Snbeams   }
5897d8d0e25Snbeams }
5907d8d0e25Snbeams 
5917d8d0e25Snbeams //------------------------------------------------------------------------------
5927d8d0e25Snbeams // 3D transpose tensor contract add x
5937d8d0e25Snbeams //------------------------------------------------------------------------------
5947d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
5957d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
5967d8d0e25Snbeams   CeedScalar r_B[Q1d];
5977d8d0e25Snbeams   for (CeedInt i = 0; i < Q1d; ++i)
5987d8d0e25Snbeams     r_B[i] = B[data.tidx + i*P1d];
5997d8d0e25Snbeams 
6007d8d0e25Snbeams   for (CeedInt k = 0; k < P1d; ++k) {
6017d8d0e25Snbeams     data.slice[data.tidx+data.tidy*T1d] = U[k];
6027d8d0e25Snbeams     __syncthreads();
6037d8d0e25Snbeams     if (data.tidx < P1d && data.tidy < P1d)
6047d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6057d8d0e25Snbeams         V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
6067d8d0e25Snbeams     __syncthreads();
6077d8d0e25Snbeams   }
6087d8d0e25Snbeams }
6097d8d0e25Snbeams 
6107d8d0e25Snbeams //------------------------------------------------------------------------------
6117d8d0e25Snbeams // 3D interpolate to quadrature points
6127d8d0e25Snbeams //------------------------------------------------------------------------------
6137d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6147d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6157d8d0e25Snbeams   CeedScalar r_t1[T1d];
6167d8d0e25Snbeams   CeedScalar r_t2[T1d];
6177d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6187d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6197d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6207d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
6217d8d0e25Snbeams   }
6227d8d0e25Snbeams }
6237d8d0e25Snbeams 
6247d8d0e25Snbeams //------------------------------------------------------------------------------
6257d8d0e25Snbeams // 3D interpolate transpose
6267d8d0e25Snbeams //------------------------------------------------------------------------------
6277d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6287d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
6297d8d0e25Snbeams   CeedScalar r_t1[T1d];
6307d8d0e25Snbeams   CeedScalar r_t2[T1d];
6317d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6327d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
6337d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6347d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6357d8d0e25Snbeams   }
6367d8d0e25Snbeams }
6377d8d0e25Snbeams 
6387d8d0e25Snbeams //------------------------------------------------------------------------------
6397d8d0e25Snbeams // 3D derivatives at quadrature points
6407d8d0e25Snbeams //------------------------------------------------------------------------------
6417d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6427d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6437d8d0e25Snbeams   CeedScalar r_t1[T1d];
6447d8d0e25Snbeams   CeedScalar r_t2[T1d];
6457d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6467d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
6477d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6487d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
6497d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6507d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6517d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
6527d8d0e25Snbeams     ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
6537d8d0e25Snbeams     ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6547d8d0e25Snbeams     ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
6557d8d0e25Snbeams   }
6567d8d0e25Snbeams }
6577d8d0e25Snbeams 
6587d8d0e25Snbeams //------------------------------------------------------------------------------
6597d8d0e25Snbeams // 3D derivatives transpose
6607d8d0e25Snbeams //------------------------------------------------------------------------------
6617d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d>
6627d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6637d8d0e25Snbeams   CeedScalar r_t1[T1d];
6647d8d0e25Snbeams   CeedScalar r_t2[T1d];
6657d8d0e25Snbeams   for (CeedInt comp = 0; comp < NCOMP; comp++) {
6667d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
6677d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6687d8d0e25Snbeams     ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
6697d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
6707d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
6717d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6727d8d0e25Snbeams     ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
6737d8d0e25Snbeams     ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
6747d8d0e25Snbeams     ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
6757d8d0e25Snbeams   }
6767d8d0e25Snbeams }
6777d8d0e25Snbeams 
6787d8d0e25Snbeams //------------------------------------------------------------------------------
6797d8d0e25Snbeams // 3D collocated derivatives computation
6807d8d0e25Snbeams //------------------------------------------------------------------------------
6817d8d0e25Snbeams template <int NCOMP, int Q1d>
6827d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6837d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
6847d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
6857d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
6867d8d0e25Snbeams       __syncthreads();
6877d8d0e25Snbeams       // X derivative
6887d8d0e25Snbeams       r_V[comp+0*NCOMP] = 0.0;
6897d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6907d8d0e25Snbeams         r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
6917d8d0e25Snbeams       // Y derivative
6927d8d0e25Snbeams       r_V[comp+1*NCOMP] = 0.0;
6937d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6947d8d0e25Snbeams         r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
6957d8d0e25Snbeams       // Z derivative
6967d8d0e25Snbeams       r_V[comp+2*NCOMP] = 0.0;
6977d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
6987d8d0e25Snbeams         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
6997d8d0e25Snbeams       __syncthreads();
7007d8d0e25Snbeams     }
7017d8d0e25Snbeams   }
7027d8d0e25Snbeams }
7037d8d0e25Snbeams 
7047d8d0e25Snbeams //------------------------------------------------------------------------------
7057d8d0e25Snbeams // 3D collocated derivatives transpose
7067d8d0e25Snbeams //------------------------------------------------------------------------------
7077d8d0e25Snbeams template <int NCOMP, int Q1d>
7087d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
7097d8d0e25Snbeams   if (data.tidx < Q1d && data.tidy < Q1d) {
7107d8d0e25Snbeams     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
7117d8d0e25Snbeams       // X derivative
7127d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
7137d8d0e25Snbeams       __syncthreads();
7147d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7157d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
7167d8d0e25Snbeams       __syncthreads();
7177d8d0e25Snbeams       // Y derivative
7187d8d0e25Snbeams       data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
7197d8d0e25Snbeams       __syncthreads();
7207d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7217d8d0e25Snbeams         r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
7227d8d0e25Snbeams       __syncthreads();
7237d8d0e25Snbeams       // Z derivative
7247d8d0e25Snbeams       for (CeedInt i = 0; i < Q1d; ++i)
7257d8d0e25Snbeams         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
7267d8d0e25Snbeams     }
7277d8d0e25Snbeams   }
7287d8d0e25Snbeams }
7297d8d0e25Snbeams 
7307d8d0e25Snbeams //------------------------------------------------------------------------------
7317d8d0e25Snbeams // 1D quadrature weights
7327d8d0e25Snbeams //------------------------------------------------------------------------------
7337d8d0e25Snbeams template <int Q1d>
7347d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7357d8d0e25Snbeams   *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
7367d8d0e25Snbeams }
7377d8d0e25Snbeams 
7387d8d0e25Snbeams //------------------------------------------------------------------------------
7397d8d0e25Snbeams // 2D quadrature weights
7407d8d0e25Snbeams //------------------------------------------------------------------------------
7417d8d0e25Snbeams template <int Q1d>
7427d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7437d8d0e25Snbeams   *w = (data.tidx < Q1d && data.tidy < Q1d) ?
7447d8d0e25Snbeams         qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7457d8d0e25Snbeams }
7467d8d0e25Snbeams 
7477d8d0e25Snbeams //------------------------------------------------------------------------------
7487d8d0e25Snbeams // 3D quadrature weights
7497d8d0e25Snbeams //------------------------------------------------------------------------------
7507d8d0e25Snbeams template <int Q1d>
7517d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
7527d8d0e25Snbeams   const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
7537d8d0e25Snbeams   const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
7547d8d0e25Snbeams   for (CeedInt z = 0; z < Q1d; ++z)
7557d8d0e25Snbeams     w[z] = quad ? pw*qweight1d[z] : 0.0;
7567d8d0e25Snbeams }
7577d8d0e25Snbeams 
7587d8d0e25Snbeams );
7597d8d0e25Snbeams //------------------------------------------------------------------------------
7607d8d0e25Snbeams // Build singe operator kernel
7617d8d0e25Snbeams //------------------------------------------------------------------------------
76237c3b1cfSnbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) {
7637d8d0e25Snbeams 
7647d8d0e25Snbeams   using std::ostringstream;
7657d8d0e25Snbeams   using std::string;
7667d8d0e25Snbeams   int ierr;
7677d8d0e25Snbeams   bool setupdone;
768e15f9bd0SJeremy L Thompson   ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr);
769e15f9bd0SJeremy L Thompson   if (setupdone) return CEED_ERROR_SUCCESS;
7707d8d0e25Snbeams   Ceed ceed;
771e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
7727d8d0e25Snbeams   CeedOperator_Hip_gen *data;
773e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr);
7747d8d0e25Snbeams   CeedQFunction qf;
7757d8d0e25Snbeams   CeedQFunction_Hip_gen *qf_data;
776e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr);
777e15f9bd0SJeremy L Thompson   ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr);
778e79b91d9SJeremy L Thompson   CeedSize lsize;
77978464608Sjeremylt   CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields,
780e79b91d9SJeremy L Thompson           numoutputfields, ncomp, dim = 0;
781e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr);
782e15f9bd0SJeremy L Thompson   ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr);
7837d8d0e25Snbeams   CeedOperatorField *opinputfields, *opoutputfields;
7847e7773b5SJeremy L Thompson   ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields);
785e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7867d8d0e25Snbeams   CeedQFunctionField *qfinputfields, *qfoutputfields;
7877e7773b5SJeremy L Thompson   ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields);
788e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
7897d8d0e25Snbeams   CeedEvalMode emode;
7907d8d0e25Snbeams   CeedBasis basis;
7917d8d0e25Snbeams   CeedBasis_Hip_shared *basis_data;
7927d8d0e25Snbeams   CeedElemRestriction Erestrict;
7937d8d0e25Snbeams   CeedElemRestriction_Hip *restr_data;
7947d8d0e25Snbeams 
7950b454692Sjeremylt   // Check for restriction only identity operator
7960b454692Sjeremylt   bool is_identity_qf;
7970b454692Sjeremylt   ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr);
7980b454692Sjeremylt   if (is_identity_qf) {
7990b454692Sjeremylt     CeedEvalMode emodein, emodeout;
8000b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein);  CeedChkBackend(ierr);
8010b454692Sjeremylt     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout);  CeedChkBackend(ierr);
8020b454692Sjeremylt     if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE)
8030b454692Sjeremylt       // LCOV_EXCL_START
8040b454692Sjeremylt       return CeedError(ceed, CEED_ERROR_BACKEND,
8050b454692Sjeremylt                        "Backend does not implement restriction only identity operators");
8060b454692Sjeremylt     // LCOV_EXCL_STOP
8070b454692Sjeremylt   }
8080b454692Sjeremylt 
8097d8d0e25Snbeams   ostringstream code;
8107d8d0e25Snbeams   string devFunctions(deviceFunctions);
8117d8d0e25Snbeams   code << devFunctions;
8127d8d0e25Snbeams 
8137d8d0e25Snbeams   string qFunction(qf_data->qFunctionSource);
8147d8d0e25Snbeams   string qFunctionName(qf_data->qFunctionName);
8157d8d0e25Snbeams   string oper;
8167d8d0e25Snbeams   oper = "CeedKernel_Hip_gen_" + qFunctionName;
8177d8d0e25Snbeams 
8187d8d0e25Snbeams   code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n";
81977841947SLeila Ghaffari   code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n";
820e15f9bd0SJeremy L Thompson   code << "#define CeedPragmaSIMD\n";
821e15f9bd0SJeremy L Thompson   code << "#define CEED_ERROR_SUCCESS 0\n\n";
8227d8d0e25Snbeams 
8237d8d0e25Snbeams   // Find dim and Q1d
8247d8d0e25Snbeams   bool useCollograd = true;
8257d8d0e25Snbeams   data->maxP1d = 0;
8267d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
827e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
8287d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
829e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8307d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
831e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8327d8d0e25Snbeams 
8337d8d0e25Snbeams       // Check for collocated gradient
834437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8357d8d0e25Snbeams 
8367d8d0e25Snbeams       // Collect dim and Q1d
837e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8387d8d0e25Snbeams       bool isTensor;
839e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8407d8d0e25Snbeams       if (isTensor) {
841e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
842e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
8437d8d0e25Snbeams         if (P1d>data->maxP1d) data->maxP1d = P1d;
8447d8d0e25Snbeams       } else {
8457d8d0e25Snbeams         // LCOV_EXCL_START
846e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8477d8d0e25Snbeams         // LCOV_EXCL_STOP
8487d8d0e25Snbeams         }
8497d8d0e25Snbeams     }
8507d8d0e25Snbeams   }
8517d8d0e25Snbeams   // Check output bases for Q1d, dim as well
8527d8d0e25Snbeams   //   The only imput basis might be CEED_BASIS_COLLOCATED
8537d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
854e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
8557d8d0e25Snbeams 
8567d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
857e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
8587d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
859e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
8607d8d0e25Snbeams 
8617d8d0e25Snbeams       // Collect dim and Q1d
862e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
8637d8d0e25Snbeams       bool isTensor;
864e15f9bd0SJeremy L Thompson       ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr);
8657d8d0e25Snbeams       if (isTensor) {
866e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr);
8677d8d0e25Snbeams       } else {
8687d8d0e25Snbeams         // LCOV_EXCL_START
869e15f9bd0SJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
8707d8d0e25Snbeams         // LCOV_EXCL_STOP
8717d8d0e25Snbeams         }
8727d8d0e25Snbeams 
8737d8d0e25Snbeams       // Check for collocated gradient
874437930d1SJeremy L Thompson       useCollograd = useCollograd && basis_data->d_collo_grad_1d;
8757d8d0e25Snbeams     }
8767d8d0e25Snbeams   }
8777d8d0e25Snbeams   data->dim = dim;
8787d8d0e25Snbeams   data->Q1d = Q1d;
8797d8d0e25Snbeams 
8807d8d0e25Snbeams   // Define CEED_Q_VLA
8817d8d0e25Snbeams   if (dim != 3 || useCollograd) {
8827d8d0e25Snbeams     code << "\n#define CEED_Q_VLA 1\n\n";
8837d8d0e25Snbeams   } else {
8847d8d0e25Snbeams     code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n";
8857d8d0e25Snbeams   }
8867d8d0e25Snbeams 
8877d8d0e25Snbeams   code << qFunction;
8887d8d0e25Snbeams 
8897d8d0e25Snbeams   // Setup
8907d8d0e25Snbeams   code << "\n// -----------------------------------------------------------------------------\n";
891b3e1519bSnbeams   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
892b3e1519bSnbeams   code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n";
8937d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
8947d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
895e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
8967d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
8977d8d0e25Snbeams       code << "  const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
8987d8d0e25Snbeams     }
8997d8d0e25Snbeams   }
9007d8d0e25Snbeams 
9017d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9027d8d0e25Snbeams     code << "  CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
9037d8d0e25Snbeams   }
9047d8d0e25Snbeams 
9057d8d0e25Snbeams   code << "  const CeedInt Dim = "<<dim<<";\n";
9067d8d0e25Snbeams   code << "  const CeedInt Q1d = "<<Q1d<<";\n";
9077d8d0e25Snbeams 
9087d8d0e25Snbeams   code << "  HIP_DYNAMIC_SHARED( CeedScalar, slice)\n";
9097d8d0e25Snbeams   code << "  BackendData data;\n";
9107d8d0e25Snbeams   code << "  data.tidx = threadIdx.x;\n";
9117d8d0e25Snbeams   code << "  data.tidy = threadIdx.y;\n";
9127d8d0e25Snbeams   code << "  data.tidz = threadIdx.z;\n";
9137d8d0e25Snbeams   code << "  data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
9147d8d0e25Snbeams   code << "  data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n";
9157d8d0e25Snbeams 
9167d8d0e25Snbeams   code << "\n  // -- Input field constants and basis data --\n";
9177d8d0e25Snbeams   //Initialize constants, and matrices B and G
9187d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
9197d8d0e25Snbeams     code << "  // ---- Input field "<<i<<" ----\n";
9207d8d0e25Snbeams     // Get elemsize, emode, ncomp
9217d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
922e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9237d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
924e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9257d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
926e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9277d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
928e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9297d8d0e25Snbeams 
9307d8d0e25Snbeams     // Set field constants
9317d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT) {
932e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
9337d8d0e25Snbeams       if (basis != CEED_BASIS_COLLOCATED) {
934e15f9bd0SJeremy L Thompson         ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9357d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
9367d8d0e25Snbeams       } else {
9377d8d0e25Snbeams         code << "  const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n";
9387d8d0e25Snbeams       }
9397d8d0e25Snbeams       code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
9407d8d0e25Snbeams     }
9417d8d0e25Snbeams 
9427d8d0e25Snbeams     // Load basis data
9437d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
9447d8d0e25Snbeams     switch (emode) {
9457d8d0e25Snbeams     case CEED_EVAL_NONE:
9467d8d0e25Snbeams       break;
9477d8d0e25Snbeams     case CEED_EVAL_INTERP:
948e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
949437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
95080a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9517d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9527d8d0e25Snbeams       break;
9537d8d0e25Snbeams     case CEED_EVAL_GRAD:
954e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
955437930d1SJeremy L Thompson       data->B.in[i] = basis_data->d_interp_1d;
95680a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9577d8d0e25Snbeams       code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
9587d8d0e25Snbeams       if (useCollograd) {
959437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_collo_grad_1d;
96080a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n";
9617d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9627d8d0e25Snbeams       } else {
963437930d1SJeremy L Thompson         data->G.in[i] = basis_data->d_grad_1d;
96480a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
9657d8d0e25Snbeams         code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
9667d8d0e25Snbeams       }
9677d8d0e25Snbeams       break;
9687d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
9697d8d0e25Snbeams       break; // No action
9707d8d0e25Snbeams     case CEED_EVAL_DIV:
9717d8d0e25Snbeams       break; // TODO: Not implemented
9727d8d0e25Snbeams     case CEED_EVAL_CURL:
9737d8d0e25Snbeams       break; // TODO: Not implemented
9747d8d0e25Snbeams     }
9757d8d0e25Snbeams   }
9767d8d0e25Snbeams 
9777d8d0e25Snbeams   code << "\n  // -- Output field constants and basis data --\n";
9787d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
9797d8d0e25Snbeams     code << "  // ---- Output field "<<i<<" ----\n";
9807d8d0e25Snbeams     // Get elemsize, emode, ncomp
9817d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
982e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9837d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
984e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9857d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
986e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9877d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
988e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
9897d8d0e25Snbeams 
9907d8d0e25Snbeams     // Set field constants
991e15f9bd0SJeremy L Thompson     ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr);
9927d8d0e25Snbeams     if (basis != CEED_BASIS_COLLOCATED) {
993e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr);
9947d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
9957d8d0e25Snbeams     } else {
9967d8d0e25Snbeams       code << "  const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n";
9977d8d0e25Snbeams     }
9987d8d0e25Snbeams     code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
9997d8d0e25Snbeams 
10007d8d0e25Snbeams     // Load basis data
10017d8d0e25Snbeams     code << "  // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10027d8d0e25Snbeams     switch (emode) {
10037d8d0e25Snbeams     case CEED_EVAL_NONE:
10047d8d0e25Snbeams       break; // No action
10057d8d0e25Snbeams     case CEED_EVAL_INTERP:
1006e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1007437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
100880a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10097d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10107d8d0e25Snbeams       break;
10117d8d0e25Snbeams     case CEED_EVAL_GRAD:
1012e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1013437930d1SJeremy L Thompson       data->B.out[i] = basis_data->d_interp_1d;
101480a9ef05SNatalie Beams       code << "  __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10157d8d0e25Snbeams       code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
10167d8d0e25Snbeams       if (useCollograd) {
1017437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_collo_grad_1d;
101880a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n";
10197d8d0e25Snbeams         code << "  loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10207d8d0e25Snbeams       } else {
1021437930d1SJeremy L Thompson         data->G.out[i] = basis_data->d_grad_1d;
102280a9ef05SNatalie Beams         code << "  __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
10237d8d0e25Snbeams         code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
10247d8d0e25Snbeams       }
10257d8d0e25Snbeams       break;
10267d8d0e25Snbeams     // LCOV_EXCL_START
10277d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
10287d8d0e25Snbeams       Ceed ceed;
1029e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1030e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
10317d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
10327d8d0e25Snbeams       break; // Should not occur
10337d8d0e25Snbeams     }
10347d8d0e25Snbeams     case CEED_EVAL_DIV:
10357d8d0e25Snbeams       break; // TODO: Not implemented
10367d8d0e25Snbeams     case CEED_EVAL_CURL:
10377d8d0e25Snbeams       break; // TODO: Not implemented
10387d8d0e25Snbeams       // LCOV_EXCL_STOP
10397d8d0e25Snbeams     }
10407d8d0e25Snbeams   }
10417d8d0e25Snbeams   code << "\n  // -- Element loop --\n";
10427d8d0e25Snbeams   code << "  __syncthreads();\n";
10437d8d0e25Snbeams   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
10447d8d0e25Snbeams   // Input basis apply if needed
10457d8d0e25Snbeams   // Generate the correct eval mode code for each input
10467d8d0e25Snbeams   code << "    // -- Input field restrictions and basis actions --\n";
10477d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
10487d8d0e25Snbeams     code << "    // ---- Input field "<<i<<" ----\n";
10497d8d0e25Snbeams     // Get elemsize, emode, ncomp
10507d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
1051e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10527d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1053e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10547d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1055e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10567d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1057e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
10587d8d0e25Snbeams 
10597d8d0e25Snbeams     // Restriction
10607d8d0e25Snbeams     if (emode != CEED_EVAL_WEIGHT &&
10617d8d0e25Snbeams         !((emode == CEED_EVAL_NONE) && useCollograd)) {
10627d8d0e25Snbeams       code << "    CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
10637d8d0e25Snbeams 
10647d8d0e25Snbeams       bool isStrided;
1065e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
10667d8d0e25Snbeams       if (!isStrided) {
10677d8d0e25Snbeams         ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1068e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10697d8d0e25Snbeams         code << "    const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
10707d8d0e25Snbeams         CeedInt compstride;
1071e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
10727d8d0e25Snbeams         code << "    // CompStride: "<<compstride<<"\n";
1073e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
10747d8d0e25Snbeams         data->indices.in[i] = restr_data->d_ind;
10757d8d0e25Snbeams         code << "    readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
10767d8d0e25Snbeams       } else {
10777d8d0e25Snbeams         bool backendstrides;
10787d8d0e25Snbeams         ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1079e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10807d8d0e25Snbeams         CeedInt nelem;
10817d8d0e25Snbeams         ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1082e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
10837d8d0e25Snbeams         CeedInt strides[3] = {1, elemsize*nelem, elemsize};
10847d8d0e25Snbeams         if (!backendstrides) {
10857d8d0e25Snbeams           ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1086e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
10877d8d0e25Snbeams         }
10887d8d0e25Snbeams         code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
10897d8d0e25Snbeams         code << "    readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n";
10907d8d0e25Snbeams       }
10917d8d0e25Snbeams     }
10927d8d0e25Snbeams 
10937d8d0e25Snbeams     // Basis action
10947d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
10957d8d0e25Snbeams     switch (emode) {
10967d8d0e25Snbeams     case CEED_EVAL_NONE:
10977d8d0e25Snbeams       if (!useCollograd) {
10987d8d0e25Snbeams         code << "    CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n";
10997d8d0e25Snbeams       }
11007d8d0e25Snbeams       break;
11017d8d0e25Snbeams     case CEED_EVAL_INTERP:
11027d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
11037d8d0e25Snbeams       code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
11047d8d0e25Snbeams       break;
11057d8d0e25Snbeams     case CEED_EVAL_GRAD:
11067d8d0e25Snbeams       if (useCollograd) {
11077d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
11087d8d0e25Snbeams         code << "    interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
11097d8d0e25Snbeams       } else {
11107d8d0e25Snbeams         code << "    CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
11117d8d0e25Snbeams         code << "    grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
11127d8d0e25Snbeams       }
11137d8d0e25Snbeams       break;
11147d8d0e25Snbeams     case CEED_EVAL_WEIGHT:
11157d8d0e25Snbeams       code << "    CeedScalar r_t"<<i<<"[Q1d];\n";
1116e15f9bd0SJeremy L Thompson       ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr);
1117e15f9bd0SJeremy L Thompson       ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr);
1118437930d1SJeremy L Thompson       data->W = basis_data->d_q_weight_1d;
11197d8d0e25Snbeams       code << "    weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
11207d8d0e25Snbeams       break; // No action
11217d8d0e25Snbeams     case CEED_EVAL_DIV:
11227d8d0e25Snbeams       break; // TODO: Not implemented
11237d8d0e25Snbeams     case CEED_EVAL_CURL:
11247d8d0e25Snbeams       break; // TODO: Not implemented
11257d8d0e25Snbeams     }
11267d8d0e25Snbeams   }
11277d8d0e25Snbeams 
11287d8d0e25Snbeams   // Q function
11297d8d0e25Snbeams   code << "\n    // -- Output field setup --\n";
11307d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
11317d8d0e25Snbeams       code << "\n    // ---- Output field "<<i<<" ----\n";
11327d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1133e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
11347d8d0e25Snbeams     if (emode==CEED_EVAL_GRAD)
11357d8d0e25Snbeams     {
11367d8d0e25Snbeams       if (useCollograd) {
11377d8d0e25Snbeams         //Accumulator for gradient slices
11387d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11397d8d0e25Snbeams         code << "    for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n";
11407d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < Q1d; ++j) {\n";
11417d8d0e25Snbeams         code << "        r_tt"<<i<<"[j + i*Q1d] = 0.0;\n";
11427d8d0e25Snbeams         code << "      }\n";
11437d8d0e25Snbeams         code << "    }\n";
11447d8d0e25Snbeams       } else {
11457d8d0e25Snbeams         code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
11467d8d0e25Snbeams       }
11477d8d0e25Snbeams     }
11487d8d0e25Snbeams     if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
11497d8d0e25Snbeams     {
11507d8d0e25Snbeams       code << "    CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
11517d8d0e25Snbeams     }
11527d8d0e25Snbeams   }
11537d8d0e25Snbeams   // We treat quadrature points per slice in 3d to save registers
11547d8d0e25Snbeams   if (useCollograd) {
11557d8d0e25Snbeams     code << "\n    // Note: Collocated Gradient\n";
11567d8d0e25Snbeams     code << "#pragma unroll\n";
11577d8d0e25Snbeams     code << "    for (CeedInt q=0; q<Q1d; q++) {\n";
11587d8d0e25Snbeams     code << "      // -- Input fields --\n";
11597d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
11607d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
11617d8d0e25Snbeams       // Get elemsize, emode, ncomp
11627d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
1163e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
11647d8d0e25Snbeams       // Basis action
11657d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
11667d8d0e25Snbeams       switch (emode) {
11677d8d0e25Snbeams       case CEED_EVAL_NONE:
11687d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
11697d8d0e25Snbeams 
11707d8d0e25Snbeams         bool isStrided;
1171e15f9bd0SJeremy L Thompson         ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr);
1172e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr);
1173e15f9bd0SJeremy L Thompson         ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
11747d8d0e25Snbeams         if (!isStrided) {
11757d8d0e25Snbeams           ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1176e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11777d8d0e25Snbeams           code << "      const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n";
11787d8d0e25Snbeams           CeedInt compstride;
1179e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
11807d8d0e25Snbeams           code << "      // CompStride: "<<compstride<<"\n";
1181e15f9bd0SJeremy L Thompson           ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
11827d8d0e25Snbeams           data->indices.in[i] = restr_data->d_ind;
11837d8d0e25Snbeams           code << "      readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n";
11847d8d0e25Snbeams         } else {
11857d8d0e25Snbeams           bool backendstrides;
11867d8d0e25Snbeams           ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1187e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11887d8d0e25Snbeams           CeedInt nelem;
11897d8d0e25Snbeams           ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1190e15f9bd0SJeremy L Thompson           CeedChkBackend(ierr);
11917d8d0e25Snbeams           CeedInt strides[3] = {1, elemsize*nelem, elemsize};
11927d8d0e25Snbeams           if (!backendstrides) {
11937d8d0e25Snbeams             ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1194e15f9bd0SJeremy L Thompson             CeedChkBackend(ierr);
11957d8d0e25Snbeams           }
11967d8d0e25Snbeams           code << "      // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
11977d8d0e25Snbeams           code << "      readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n";
11987d8d0e25Snbeams         }
11997d8d0e25Snbeams         break;
12007d8d0e25Snbeams       case CEED_EVAL_INTERP:
12017d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n";
12027d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n";
12037d8d0e25Snbeams         code << "        r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n";
12047d8d0e25Snbeams         code << "      }\n";
12057d8d0e25Snbeams         break;
12067d8d0e25Snbeams       case CEED_EVAL_GRAD:
12077d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n";
12087d8d0e25Snbeams         code << "      gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n";
12097d8d0e25Snbeams         break;
12107d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12117d8d0e25Snbeams         code << "      CeedScalar r_q"<<i<<"[1];\n";
12127d8d0e25Snbeams         code << "      r_q"<<i<<"[0] = r_t"<<i<<"[q];\n";
12137d8d0e25Snbeams         break; // No action
12147d8d0e25Snbeams       case CEED_EVAL_DIV:
12157d8d0e25Snbeams         break; // TODO: Not implemented
12167d8d0e25Snbeams       case CEED_EVAL_CURL:
12177d8d0e25Snbeams         break; // TODO: Not implemented
12187d8d0e25Snbeams       }
12197d8d0e25Snbeams     }
12207d8d0e25Snbeams     code << "\n      // -- Output fields --\n";
12217d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12227d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12237d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1224e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12257d8d0e25Snbeams       // Basis action
12267d8d0e25Snbeams       switch (emode) {
12277d8d0e25Snbeams       case CEED_EVAL_NONE:
12287d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12297d8d0e25Snbeams         break; // No action
12307d8d0e25Snbeams       case CEED_EVAL_INTERP:
12317d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n";
12327d8d0e25Snbeams         break;
12337d8d0e25Snbeams       case CEED_EVAL_GRAD:
12347d8d0e25Snbeams         code << "      CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n";
12357d8d0e25Snbeams         break;
12367d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
12377d8d0e25Snbeams         break; // Should not occur
12387d8d0e25Snbeams       case CEED_EVAL_DIV:
12397d8d0e25Snbeams         break; // TODO: Not implemented
12407d8d0e25Snbeams       case CEED_EVAL_CURL:
12417d8d0e25Snbeams         break; // TODO: Not implemented
12427d8d0e25Snbeams       }
12437d8d0e25Snbeams     }
12447d8d0e25Snbeams   } else {
12457d8d0e25Snbeams     code << "\n      // Note: No Collocated Gradient\n";
12467d8d0e25Snbeams     code << "      // -- Input fields --\n";
12477d8d0e25Snbeams     for (CeedInt i = 0; i < numinputfields; i++) {
12487d8d0e25Snbeams       code << "      // ---- Input field "<<i<<" ----\n";
12497d8d0e25Snbeams       code << "      CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n";
12507d8d0e25Snbeams     }
12517d8d0e25Snbeams     code << "      // -- Output fields --\n";
12527d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12537d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12547d8d0e25Snbeams       code << "      CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n";
12557d8d0e25Snbeams     }
12567d8d0e25Snbeams   }
12577d8d0e25Snbeams   code << "\n      // -- QFunction Inputs and outputs --\n";
12587d8d0e25Snbeams   code << "      CeedScalar* in["<<numinputfields<<"];\n";
12597d8d0e25Snbeams   for (CeedInt i = 0; i < numinputfields; i++) {
12607d8d0e25Snbeams     code << "      // ---- Input field "<<i<<" ----\n";
12617d8d0e25Snbeams     code << "      in["<<i<<"] = r_q"<<i<<";\n";
12627d8d0e25Snbeams   }
12637d8d0e25Snbeams   code << "      CeedScalar* out["<<numoutputfields<<"];\n";
12647d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
12657d8d0e25Snbeams     code << "      // ---- Output field "<<i<<" ----\n";
12667d8d0e25Snbeams     code << "      out["<<i<<"] = r_qq"<<i<<";\n";
12677d8d0e25Snbeams   }
12687d8d0e25Snbeams   code << "\n      // -- Apply QFunction --\n";
12697d8d0e25Snbeams   code << "      "<<qFunctionName<<"(ctx, ";
12707d8d0e25Snbeams   if (dim != 3 || useCollograd) {
12717d8d0e25Snbeams     code << "1";
12727d8d0e25Snbeams   } else {
12737d8d0e25Snbeams     code << "Q1d";
12747d8d0e25Snbeams   }
12757d8d0e25Snbeams   code << ", in, out);\n";
12767d8d0e25Snbeams   if (useCollograd) {
12777d8d0e25Snbeams     code << "\n      // Note: Collocated Gradient\n";
12787d8d0e25Snbeams     code << "      // -- Output fields --\n";
12797d8d0e25Snbeams     for (CeedInt i = 0; i < numoutputfields; i++) {
12807d8d0e25Snbeams       code << "      // ---- Output field "<<i<<" ----\n";
12817d8d0e25Snbeams       ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1282e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
12837d8d0e25Snbeams       // Basis action
12847d8d0e25Snbeams       code << "      // EvalMode: "<<CeedEvalModes[emode]<<"\n";
12857d8d0e25Snbeams       switch (emode) {
12867d8d0e25Snbeams       case CEED_EVAL_NONE:
12877d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12887d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12897d8d0e25Snbeams         code << "      }\n";
12907d8d0e25Snbeams         break; // No action
12917d8d0e25Snbeams       case CEED_EVAL_INTERP:
12927d8d0e25Snbeams         code << "      for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n";
12937d8d0e25Snbeams         code << "        r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n";
12947d8d0e25Snbeams         code << "      }\n";
12957d8d0e25Snbeams         break;
12967d8d0e25Snbeams       case CEED_EVAL_GRAD:
12977d8d0e25Snbeams         code << "      gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n";
12987d8d0e25Snbeams         break;
12997d8d0e25Snbeams       case CEED_EVAL_WEIGHT:
13007d8d0e25Snbeams         break; // Should not occur
13017d8d0e25Snbeams       case CEED_EVAL_DIV:
13027d8d0e25Snbeams         break; // TODO: Not implemented
13037d8d0e25Snbeams       case CEED_EVAL_CURL:
13047d8d0e25Snbeams         break; // TODO: Not implemented
13057d8d0e25Snbeams       }
13067d8d0e25Snbeams     }
13077d8d0e25Snbeams     code << "    }\n";
13087d8d0e25Snbeams   }
13097d8d0e25Snbeams 
13107d8d0e25Snbeams   // Output basis apply if needed
13117d8d0e25Snbeams   // Generate the correct eval mode code for each output
13127d8d0e25Snbeams   code << "\n    // -- Output field basis action and restrictions --\n";
13137d8d0e25Snbeams   for (CeedInt i = 0; i < numoutputfields; i++) {
13147d8d0e25Snbeams     code << "    // ---- Output field "<<i<<" ----\n";
13157d8d0e25Snbeams     // Get elemsize, emode, ncomp
13167d8d0e25Snbeams     ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
1317e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13187d8d0e25Snbeams     ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
1319e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13207d8d0e25Snbeams     ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
1321e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13227d8d0e25Snbeams     ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp);
1323e15f9bd0SJeremy L Thompson     CeedChkBackend(ierr);
13247d8d0e25Snbeams     // Basis action
13257d8d0e25Snbeams     code << "    // EvalMode: "<<CeedEvalModes[emode]<<"\n";
13267d8d0e25Snbeams     switch (emode) {
13277d8d0e25Snbeams     case CEED_EVAL_NONE:
13287d8d0e25Snbeams       code << "    CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n";
13297d8d0e25Snbeams       break; // No action
13307d8d0e25Snbeams     case CEED_EVAL_INTERP:
13317d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13327d8d0e25Snbeams       code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13337d8d0e25Snbeams       break;
13347d8d0e25Snbeams     case CEED_EVAL_GRAD:
13357d8d0e25Snbeams       code << "    CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
13367d8d0e25Snbeams       if (useCollograd) {
13377d8d0e25Snbeams         code << "    interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
13387d8d0e25Snbeams       } else {
13397d8d0e25Snbeams         code << "    gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
13407d8d0e25Snbeams       }
13417d8d0e25Snbeams       break;
13427d8d0e25Snbeams     // LCOV_EXCL_START
13437d8d0e25Snbeams     case CEED_EVAL_WEIGHT: {
13447d8d0e25Snbeams       Ceed ceed;
1345e15f9bd0SJeremy L Thompson       ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr);
1346e15f9bd0SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
13477d8d0e25Snbeams                        "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
13487d8d0e25Snbeams       break; // Should not occur
13497d8d0e25Snbeams     }
13507d8d0e25Snbeams     case CEED_EVAL_DIV:
13517d8d0e25Snbeams       break; // TODO: Not implemented
13527d8d0e25Snbeams     case CEED_EVAL_CURL:
13537d8d0e25Snbeams       break; // TODO: Not implemented
13547d8d0e25Snbeams       // LCOV_EXCL_STOP
13557d8d0e25Snbeams     }
13567d8d0e25Snbeams     // Restriction
13577d8d0e25Snbeams       bool isStrided;
1358e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr);
13597d8d0e25Snbeams     if (!isStrided) {
13607d8d0e25Snbeams       ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize);
1361e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13627d8d0e25Snbeams       code << "    const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n";
13637d8d0e25Snbeams       CeedInt compstride;
1364e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr);
13657d8d0e25Snbeams       code << "    // CompStride: "<<compstride<<"\n";
1366e15f9bd0SJeremy L Thompson       ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr);
13677d8d0e25Snbeams       data->indices.out[i] = restr_data->d_ind;
13687d8d0e25Snbeams       code << "    writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
13697d8d0e25Snbeams     } else {
13707d8d0e25Snbeams       bool backendstrides;
13717d8d0e25Snbeams       ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides);
1372e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13737d8d0e25Snbeams       CeedInt nelem;
13747d8d0e25Snbeams       ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem);
1375e15f9bd0SJeremy L Thompson       CeedChkBackend(ierr);
13767d8d0e25Snbeams       CeedInt strides[3] = {1, elemsize*nelem, elemsize};
13777d8d0e25Snbeams       if (!backendstrides) {
13787d8d0e25Snbeams         ierr = CeedElemRestrictionGetStrides(Erestrict, &strides);
1379e15f9bd0SJeremy L Thompson         CeedChkBackend(ierr);
13807d8d0e25Snbeams       }
13817d8d0e25Snbeams       code << "    // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n";
13827d8d0e25Snbeams       code << "    writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n";
13837d8d0e25Snbeams     }
13847d8d0e25Snbeams   }
13857d8d0e25Snbeams 
13867d8d0e25Snbeams   code << "  }\n";
13877d8d0e25Snbeams   code << "}\n";
13887d8d0e25Snbeams   code << "// -----------------------------------------------------------------------------\n\n";
13897d8d0e25Snbeams 
13907d8d0e25Snbeams   // View kernel for debugging
139146dc0734SJeremy L Thompson   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
13923f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
13937d8d0e25Snbeams 
1394*539ec17dSJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
139537c3b1cfSnbeams   ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes);
1396b3e1519bSnbeams   CeedChkBackend(ierr);
1397b3e1519bSnbeams   ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2,
1398b3e1519bSnbeams                          "T1d", block_sizes[0],
1399b3e1519bSnbeams                          "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]);
1400e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
14017d8d0e25Snbeams   ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op);
1402e15f9bd0SJeremy L Thompson   CeedChkBackend(ierr);
14037d8d0e25Snbeams 
1404e15f9bd0SJeremy L Thompson   ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr);
1405e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
14067d8d0e25Snbeams }
14077d8d0e25Snbeams //------------------------------------------------------------------------------
1408