17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details. 47d8d0e25Snbeams // 57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software 67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral 77d8d0e25Snbeams // element discretizations for exascale applications. For more information and 87d8d0e25Snbeams // source code availability see http://github.com/ceed. 97d8d0e25Snbeams // 107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office 127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for 137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including 147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early 157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative. 163d576824SJeremy L Thompson 177d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 187d8d0e25Snbeams 19ec3da8bcSJed Brown #include <ceed/ceed.h> 20ec3da8bcSJed Brown #include <ceed/backend.h> 217d8d0e25Snbeams #include <iostream> 223d576824SJeremy L Thompson #include <string> 237d8d0e25Snbeams #include <sstream> 243d576824SJeremy L Thompson #include "ceed-hip-gen.h" 250d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h" 267d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 277d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 287d8d0e25Snbeams 29b3e1519bSnbeams 30b3e1519bSnbeams //------------------------------------------------------------------------------ 31b3e1519bSnbeams // Calculate the block size used for launching the operator kernel 32b3e1519bSnbeams //------------------------------------------------------------------------------ 3337c3b1cfSnbeams extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt nelem, 34b3e1519bSnbeams const CeedInt P1d, const CeedInt Q1d, 35b3e1519bSnbeams CeedInt *block_sizes) { 36b3e1519bSnbeams 37b3e1519bSnbeams const CeedInt thread1d = CeedIntMax(Q1d, P1d); 38b3e1519bSnbeams if (dim==1) { 39b3e1519bSnbeams CeedInt elemsPerBlock = 64*thread1d > 256? 256/thread1d : 64; 40b3e1519bSnbeams elemsPerBlock = elemsPerBlock>0?elemsPerBlock:1; 41b3e1519bSnbeams block_sizes[0] = thread1d; 42b3e1519bSnbeams block_sizes[1] = 1; 43b3e1519bSnbeams block_sizes[2] = elemsPerBlock; 44b3e1519bSnbeams } else if (dim==2) { 45b3e1519bSnbeams const CeedInt elemsPerBlock = thread1d<4? 16 : 2; 46b3e1519bSnbeams block_sizes[0] = thread1d; 47b3e1519bSnbeams block_sizes[1] = thread1d; 48b3e1519bSnbeams block_sizes[2] = elemsPerBlock; 49b3e1519bSnbeams } else if (dim==3) { 50b3e1519bSnbeams const CeedInt elemsPerBlock = thread1d<6? 4 : (thread1d<8? 2 : 1); 51b3e1519bSnbeams block_sizes[0] = thread1d; 52b3e1519bSnbeams block_sizes[1] = thread1d; 53b3e1519bSnbeams block_sizes[2] = elemsPerBlock; 54b3e1519bSnbeams } 55b3e1519bSnbeams return CEED_ERROR_SUCCESS; 56b3e1519bSnbeams } 57b3e1519bSnbeams 587d8d0e25Snbeams static const char *deviceFunctions = QUOTE( 597d8d0e25Snbeams 607d8d0e25Snbeams //------------------------------------------------------------------------------ 617d8d0e25Snbeams // Typedefs 627d8d0e25Snbeams //------------------------------------------------------------------------------ 637d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields; 647d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt; 657d8d0e25Snbeams 667d8d0e25Snbeams typedef struct { 677d8d0e25Snbeams CeedInt tidx; 687d8d0e25Snbeams CeedInt tidy; 697d8d0e25Snbeams CeedInt tidz; 707d8d0e25Snbeams CeedInt tid; 717d8d0e25Snbeams CeedScalar* slice; 727d8d0e25Snbeams } BackendData; 737d8d0e25Snbeams 747d8d0e25Snbeams //------------------------------------------------------------------------------ 757d8d0e25Snbeams // Load matrices for basis actions 767d8d0e25Snbeams //------------------------------------------------------------------------------ 777d8d0e25Snbeams template <int P, int Q> 787d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 797d8d0e25Snbeams for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 807d8d0e25Snbeams B[i] = d_B[i]; 817d8d0e25Snbeams } 827d8d0e25Snbeams 837d8d0e25Snbeams //------------------------------------------------------------------------------ 847d8d0e25Snbeams // 1D 857d8d0e25Snbeams //------------------------------------------------------------------------------ 867d8d0e25Snbeams 877d8d0e25Snbeams //------------------------------------------------------------------------------ 887d8d0e25Snbeams // L-vector -> E-vector, offsets provided 897d8d0e25Snbeams //------------------------------------------------------------------------------ 907d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 917d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 927d8d0e25Snbeams if (data.tidx < P1d) { 937d8d0e25Snbeams const CeedInt node = data.tidx; 947d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d]; 957d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 967d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 977d8d0e25Snbeams } 987d8d0e25Snbeams } 997d8d0e25Snbeams 1007d8d0e25Snbeams //------------------------------------------------------------------------------ 1017d8d0e25Snbeams // L-vector -> E-vector, strided 1027d8d0e25Snbeams //------------------------------------------------------------------------------ 1037d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 1047d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 1057d8d0e25Snbeams if (data.tidx < P1d) { 1067d8d0e25Snbeams const CeedInt node = data.tidx; 1077d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 1087d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 1097d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 1107d8d0e25Snbeams } 1117d8d0e25Snbeams } 1127d8d0e25Snbeams 1137d8d0e25Snbeams //------------------------------------------------------------------------------ 1147d8d0e25Snbeams // E-vector -> L-vector, offsets provided 1157d8d0e25Snbeams //------------------------------------------------------------------------------ 1167d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 1177d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 1187d8d0e25Snbeams if (data.tidx < P1d) { 1197d8d0e25Snbeams const CeedInt node = data.tidx; 1207d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d]; 1217d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 1227d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 1237d8d0e25Snbeams } 1247d8d0e25Snbeams } 1257d8d0e25Snbeams 1267d8d0e25Snbeams //------------------------------------------------------------------------------ 1277d8d0e25Snbeams // E-vector -> L-vector, strided 1287d8d0e25Snbeams //------------------------------------------------------------------------------ 1297d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 1307d8d0e25Snbeams inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 1317d8d0e25Snbeams if (data.tidx < P1d) { 1327d8d0e25Snbeams const CeedInt node = data.tidx; 1337d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 1347d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 1357d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 1367d8d0e25Snbeams } 1377d8d0e25Snbeams } 1387d8d0e25Snbeams 1397d8d0e25Snbeams //------------------------------------------------------------------------------ 1407d8d0e25Snbeams // 1D tensor contraction x 1417d8d0e25Snbeams //------------------------------------------------------------------------------ 1427d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1437d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 1447d8d0e25Snbeams data.slice[data.tidx] = *U; 1457d8d0e25Snbeams __syncthreads(); 1467d8d0e25Snbeams *V = 0.0; 1477d8d0e25Snbeams if (data.tidx < Q1d) 1487d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 1497d8d0e25Snbeams *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 1507d8d0e25Snbeams __syncthreads(); 1517d8d0e25Snbeams } 1527d8d0e25Snbeams 1537d8d0e25Snbeams //------------------------------------------------------------------------------ 1547d8d0e25Snbeams // 1D transpose tensor contraction x 1557d8d0e25Snbeams //------------------------------------------------------------------------------ 1567d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1577d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 1587d8d0e25Snbeams data.slice[data.tidx] = *U; 1597d8d0e25Snbeams __syncthreads(); 1607d8d0e25Snbeams *V = 0.0; 1617d8d0e25Snbeams if (data.tidx < P1d) 1627d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 1637d8d0e25Snbeams *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 1647d8d0e25Snbeams __syncthreads(); 1657d8d0e25Snbeams } 1667d8d0e25Snbeams 1677d8d0e25Snbeams //------------------------------------------------------------------------------ 1687d8d0e25Snbeams // 1D interpolate to quadrature points 1697d8d0e25Snbeams //------------------------------------------------------------------------------ 1707d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1717d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 1727d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 1737d8d0e25Snbeams ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 1747d8d0e25Snbeams } 1757d8d0e25Snbeams 1767d8d0e25Snbeams //------------------------------------------------------------------------------ 1777d8d0e25Snbeams // 1D interpolate transpose 1787d8d0e25Snbeams //------------------------------------------------------------------------------ 1797d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1807d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 1817d8d0e25Snbeams for (CeedInt comp=0; comp<NCOMP; comp++) 1827d8d0e25Snbeams ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 1837d8d0e25Snbeams } 1847d8d0e25Snbeams 1857d8d0e25Snbeams //------------------------------------------------------------------------------ 1867d8d0e25Snbeams // 1D derivatives at quadrature points 1877d8d0e25Snbeams //------------------------------------------------------------------------------ 1887d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1897d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 1907d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 1917d8d0e25Snbeams ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 1927d8d0e25Snbeams } 1937d8d0e25Snbeams 1947d8d0e25Snbeams //------------------------------------------------------------------------------ 1957d8d0e25Snbeams // 1D derivatives transpose 1967d8d0e25Snbeams //------------------------------------------------------------------------------ 1977d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 1987d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 1997d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 2007d8d0e25Snbeams ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 2017d8d0e25Snbeams } 2027d8d0e25Snbeams 2037d8d0e25Snbeams //------------------------------------------------------------------------------ 2047d8d0e25Snbeams // 2D 2057d8d0e25Snbeams //------------------------------------------------------------------------------ 2067d8d0e25Snbeams 2077d8d0e25Snbeams //------------------------------------------------------------------------------ 2087d8d0e25Snbeams // L-vector -> E-vector, offsets provided 2097d8d0e25Snbeams //------------------------------------------------------------------------------ 2107d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 2117d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 2127d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 2137d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 2147d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d]; 2157d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 2167d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 2177d8d0e25Snbeams } 2187d8d0e25Snbeams } 2197d8d0e25Snbeams 2207d8d0e25Snbeams //------------------------------------------------------------------------------ 2217d8d0e25Snbeams // L-vector -> E-vector, strided 2227d8d0e25Snbeams //------------------------------------------------------------------------------ 2237d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 2247d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 2257d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 2267d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 2277d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 2287d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 2297d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 2307d8d0e25Snbeams } 2317d8d0e25Snbeams } 2327d8d0e25Snbeams 2337d8d0e25Snbeams //------------------------------------------------------------------------------ 2347d8d0e25Snbeams // E-vector -> L-vector, offsets provided 2357d8d0e25Snbeams //------------------------------------------------------------------------------ 2367d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 2377d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 2387d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 2397d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 2407d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d]; 2417d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 2427d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 2437d8d0e25Snbeams } 2447d8d0e25Snbeams } 2457d8d0e25Snbeams 2467d8d0e25Snbeams //------------------------------------------------------------------------------ 2477d8d0e25Snbeams // E-vector -> L-vector, strided 2487d8d0e25Snbeams //------------------------------------------------------------------------------ 2497d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 2507d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 2517d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 2527d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 2537d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 2547d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 2557d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 2567d8d0e25Snbeams } 2577d8d0e25Snbeams } 2587d8d0e25Snbeams 2597d8d0e25Snbeams //------------------------------------------------------------------------------ 2607d8d0e25Snbeams // 2D tensor contraction x 2617d8d0e25Snbeams //------------------------------------------------------------------------------ 2627d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 2637d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 2647d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 2657d8d0e25Snbeams __syncthreads(); 2667d8d0e25Snbeams *V = 0.0; 2677d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 2687d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 2697d8d0e25Snbeams *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 2707d8d0e25Snbeams __syncthreads(); 2717d8d0e25Snbeams } 2727d8d0e25Snbeams 2737d8d0e25Snbeams //------------------------------------------------------------------------------ 2747d8d0e25Snbeams // 2D tensor contract y 2757d8d0e25Snbeams //------------------------------------------------------------------------------ 2767d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 2777d8d0e25Snbeams inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 2787d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 2797d8d0e25Snbeams __syncthreads(); 2807d8d0e25Snbeams *V = 0.0; 2817d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 2827d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 2837d8d0e25Snbeams *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 2847d8d0e25Snbeams __syncthreads(); 2857d8d0e25Snbeams } 2867d8d0e25Snbeams 2877d8d0e25Snbeams //------------------------------------------------------------------------------ 2887d8d0e25Snbeams // 2D transpose tensor contract y 2897d8d0e25Snbeams //------------------------------------------------------------------------------ 2907d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 2917d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 2927d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 2937d8d0e25Snbeams __syncthreads(); 2947d8d0e25Snbeams *V = 0.0; 2957d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 2967d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 2977d8d0e25Snbeams *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 2987d8d0e25Snbeams __syncthreads(); 2997d8d0e25Snbeams } 3007d8d0e25Snbeams 3017d8d0e25Snbeams //------------------------------------------------------------------------------ 3027d8d0e25Snbeams // 2D transpose tensor contract x 3037d8d0e25Snbeams //------------------------------------------------------------------------------ 3047d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3057d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3067d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 3077d8d0e25Snbeams __syncthreads(); 3087d8d0e25Snbeams *V = 0.0; 3097d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 3107d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 3117d8d0e25Snbeams *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 3127d8d0e25Snbeams __syncthreads(); 3137d8d0e25Snbeams } 3147d8d0e25Snbeams 3157d8d0e25Snbeams //------------------------------------------------------------------------------ 3167d8d0e25Snbeams // 2D transpose tensor contract and add x 3177d8d0e25Snbeams //------------------------------------------------------------------------------ 3187d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3197d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 3207d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 3217d8d0e25Snbeams __syncthreads(); 3227d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 3237d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 3247d8d0e25Snbeams *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 3257d8d0e25Snbeams __syncthreads(); 3267d8d0e25Snbeams } 3277d8d0e25Snbeams 3287d8d0e25Snbeams //------------------------------------------------------------------------------ 3297d8d0e25Snbeams // 2D interpolate to quadrature points 3307d8d0e25Snbeams //------------------------------------------------------------------------------ 3317d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3327d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 3337d8d0e25Snbeams CeedScalar r_t[1]; 3347d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 3357d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 3367d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 3377d8d0e25Snbeams } 3387d8d0e25Snbeams } 3397d8d0e25Snbeams 3407d8d0e25Snbeams //------------------------------------------------------------------------------ 3417d8d0e25Snbeams // 2D interpolate transpose 3427d8d0e25Snbeams //------------------------------------------------------------------------------ 3437d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3447d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 3457d8d0e25Snbeams CeedScalar r_t[1]; 3467d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 3477d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 3487d8d0e25Snbeams ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 3497d8d0e25Snbeams } 3507d8d0e25Snbeams } 3517d8d0e25Snbeams 3527d8d0e25Snbeams //------------------------------------------------------------------------------ 3537d8d0e25Snbeams // 2D derivatives at quadrature points 3547d8d0e25Snbeams //------------------------------------------------------------------------------ 3557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3567d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 3577d8d0e25Snbeams CeedScalar r_t[1]; 3587d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 3597d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 3607d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 3617d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 3627d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 3637d8d0e25Snbeams } 3647d8d0e25Snbeams } 3657d8d0e25Snbeams 3667d8d0e25Snbeams //------------------------------------------------------------------------------ 3677d8d0e25Snbeams // 2D derivatives transpose 3687d8d0e25Snbeams //------------------------------------------------------------------------------ 3697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 3707d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 3717d8d0e25Snbeams CeedScalar r_t[1]; 3727d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 3737d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 3747d8d0e25Snbeams ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 3757d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 3767d8d0e25Snbeams ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 3777d8d0e25Snbeams } 3787d8d0e25Snbeams } 3797d8d0e25Snbeams 3807d8d0e25Snbeams //------------------------------------------------------------------------------ 3817d8d0e25Snbeams // 3D 3827d8d0e25Snbeams //------------------------------------------------------------------------------ 3837d8d0e25Snbeams 3847d8d0e25Snbeams //------------------------------------------------------------------------------ 3857d8d0e25Snbeams // L-vector -> E-vector, offsets provided 3867d8d0e25Snbeams //------------------------------------------------------------------------------ 3877d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 3887d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 3897d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 3907d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 3917d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 3927d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 3937d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 3947d8d0e25Snbeams r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 3957d8d0e25Snbeams } 3967d8d0e25Snbeams } 3977d8d0e25Snbeams 3987d8d0e25Snbeams //------------------------------------------------------------------------------ 3997d8d0e25Snbeams // L-vector -> E-vector, strided 4007d8d0e25Snbeams //------------------------------------------------------------------------------ 4017d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4027d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 4037d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 4047d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 4057d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4067d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 4077d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 4087d8d0e25Snbeams r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 4097d8d0e25Snbeams } 4107d8d0e25Snbeams } 4117d8d0e25Snbeams 4127d8d0e25Snbeams //------------------------------------------------------------------------------ 4137d8d0e25Snbeams // E-vector -> Q-vector, offests provided 4147d8d0e25Snbeams //------------------------------------------------------------------------------ 4157d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d> 4167d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 4177d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 4187d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 4197d8d0e25Snbeams const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 4207d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 4217d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 4227d8d0e25Snbeams } 4237d8d0e25Snbeams } 4247d8d0e25Snbeams 4257d8d0e25Snbeams //------------------------------------------------------------------------------ 4267d8d0e25Snbeams // E-vector -> Q-vector, strided 4277d8d0e25Snbeams //------------------------------------------------------------------------------ 4287d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4297d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 4307d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 4317d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 4327d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 4337d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 4347d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 4357d8d0e25Snbeams } 4367d8d0e25Snbeams } 4377d8d0e25Snbeams 4387d8d0e25Snbeams //------------------------------------------------------------------------------ 4397d8d0e25Snbeams // E-vector -> L-vector, offsets provided 4407d8d0e25Snbeams //------------------------------------------------------------------------------ 4417d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 4427d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 4437d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 4447d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 4457d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4467d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 4477d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 4487d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 4497d8d0e25Snbeams } 4507d8d0e25Snbeams } 4517d8d0e25Snbeams 4527d8d0e25Snbeams //------------------------------------------------------------------------------ 4537d8d0e25Snbeams // E-vector -> L-vector, strided 4547d8d0e25Snbeams //------------------------------------------------------------------------------ 4557d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4567d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 4577d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 4587d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 4597d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4607d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 4617d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 4627d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 4637d8d0e25Snbeams } 4647d8d0e25Snbeams } 4657d8d0e25Snbeams 4667d8d0e25Snbeams //------------------------------------------------------------------------------ 4677d8d0e25Snbeams // 3D tensor contract x 4687d8d0e25Snbeams //------------------------------------------------------------------------------ 4697d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 4707d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4717d8d0e25Snbeams CeedScalar r_B[P1d]; 4727d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 4737d8d0e25Snbeams r_B[i] = B[i + data.tidx*P1d]; 4747d8d0e25Snbeams 4757d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 4767d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 4777d8d0e25Snbeams __syncthreads(); 4787d8d0e25Snbeams V[k] = 0.0; 4797d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 4807d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 4817d8d0e25Snbeams V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 4827d8d0e25Snbeams __syncthreads(); 4837d8d0e25Snbeams } 4847d8d0e25Snbeams } 4857d8d0e25Snbeams 4867d8d0e25Snbeams //------------------------------------------------------------------------------ 4877d8d0e25Snbeams // 3D tensor contract y 4887d8d0e25Snbeams //------------------------------------------------------------------------------ 4897d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 4907d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 4917d8d0e25Snbeams CeedScalar r_B[P1d]; 4927d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 4937d8d0e25Snbeams r_B[i] = B[i + data.tidy*P1d]; 4947d8d0e25Snbeams 4957d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 4967d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 4977d8d0e25Snbeams __syncthreads(); 4987d8d0e25Snbeams V[k] = 0.0; 4997d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 5007d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 5017d8d0e25Snbeams V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 5027d8d0e25Snbeams __syncthreads(); 5037d8d0e25Snbeams } 5047d8d0e25Snbeams } 5057d8d0e25Snbeams 5067d8d0e25Snbeams //------------------------------------------------------------------------------ 5077d8d0e25Snbeams // 3D tensor contract z 5087d8d0e25Snbeams //------------------------------------------------------------------------------ 5097d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5107d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5117d8d0e25Snbeams for (CeedInt k = 0; k < Q1d; ++k) { 5127d8d0e25Snbeams V[k] = 0.0; 5137d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 5147d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 5157d8d0e25Snbeams V[k] += B[i + k*P1d] * U[i]; // Contract z direction 5167d8d0e25Snbeams } 5177d8d0e25Snbeams } 5187d8d0e25Snbeams 5197d8d0e25Snbeams //------------------------------------------------------------------------------ 5207d8d0e25Snbeams // 3D transpose tensor contract z 5217d8d0e25Snbeams //------------------------------------------------------------------------------ 5227d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5237d8d0e25Snbeams inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5247d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 5257d8d0e25Snbeams V[k] = 0.0; 5267d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 5277d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5287d8d0e25Snbeams V[k] += B[k + i*P1d] * U[i]; // Contract z direction 5297d8d0e25Snbeams } 5307d8d0e25Snbeams } 5317d8d0e25Snbeams 5327d8d0e25Snbeams //------------------------------------------------------------------------------ 5337d8d0e25Snbeams // 3D transpose tensor contract y 5347d8d0e25Snbeams //------------------------------------------------------------------------------ 5357d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5367d8d0e25Snbeams inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5377d8d0e25Snbeams CeedScalar r_B[Q1d]; 5387d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5397d8d0e25Snbeams r_B[i] = B[data.tidy + i*P1d]; 5407d8d0e25Snbeams 5417d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 5427d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 5437d8d0e25Snbeams __syncthreads(); 5447d8d0e25Snbeams V[k] = 0.0; 5457d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 5467d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5477d8d0e25Snbeams V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 5487d8d0e25Snbeams __syncthreads(); 5497d8d0e25Snbeams } 5507d8d0e25Snbeams } 5517d8d0e25Snbeams 5527d8d0e25Snbeams //------------------------------------------------------------------------------ 5537d8d0e25Snbeams // 3D transpose tensor contract add y 5547d8d0e25Snbeams //------------------------------------------------------------------------------ 5557d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5567d8d0e25Snbeams inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5577d8d0e25Snbeams CeedScalar r_B[Q1d]; 5587d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5597d8d0e25Snbeams r_B[i] = B[data.tidy + i*P1d]; 5607d8d0e25Snbeams 5617d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 5627d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 5637d8d0e25Snbeams __syncthreads(); 5647d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 5657d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5667d8d0e25Snbeams V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 5677d8d0e25Snbeams __syncthreads(); 5687d8d0e25Snbeams } 5697d8d0e25Snbeams } 5707d8d0e25Snbeams 5717d8d0e25Snbeams //------------------------------------------------------------------------------ 5727d8d0e25Snbeams // 3D transpose tensor contract x 5737d8d0e25Snbeams //------------------------------------------------------------------------------ 5747d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5757d8d0e25Snbeams inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5767d8d0e25Snbeams CeedScalar r_B[Q1d]; 5777d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5787d8d0e25Snbeams r_B[i] = B[data.tidx + i*P1d]; 5797d8d0e25Snbeams 5807d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 5817d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 5827d8d0e25Snbeams __syncthreads(); 5837d8d0e25Snbeams V[k] = 0.0; 5847d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 5857d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5867d8d0e25Snbeams V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 5877d8d0e25Snbeams __syncthreads(); 5887d8d0e25Snbeams } 5897d8d0e25Snbeams } 5907d8d0e25Snbeams 5917d8d0e25Snbeams //------------------------------------------------------------------------------ 5927d8d0e25Snbeams // 3D transpose tensor contract add x 5937d8d0e25Snbeams //------------------------------------------------------------------------------ 5947d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 5957d8d0e25Snbeams inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 5967d8d0e25Snbeams CeedScalar r_B[Q1d]; 5977d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 5987d8d0e25Snbeams r_B[i] = B[data.tidx + i*P1d]; 5997d8d0e25Snbeams 6007d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 6017d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 6027d8d0e25Snbeams __syncthreads(); 6037d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 6047d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 6057d8d0e25Snbeams V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 6067d8d0e25Snbeams __syncthreads(); 6077d8d0e25Snbeams } 6087d8d0e25Snbeams } 6097d8d0e25Snbeams 6107d8d0e25Snbeams //------------------------------------------------------------------------------ 6117d8d0e25Snbeams // 3D interpolate to quadrature points 6127d8d0e25Snbeams //------------------------------------------------------------------------------ 6137d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 6147d8d0e25Snbeams inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 6157d8d0e25Snbeams CeedScalar r_t1[T1d]; 6167d8d0e25Snbeams CeedScalar r_t2[T1d]; 6177d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 6187d8d0e25Snbeams ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 6197d8d0e25Snbeams ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6207d8d0e25Snbeams ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 6217d8d0e25Snbeams } 6227d8d0e25Snbeams } 6237d8d0e25Snbeams 6247d8d0e25Snbeams //------------------------------------------------------------------------------ 6257d8d0e25Snbeams // 3D interpolate transpose 6267d8d0e25Snbeams //------------------------------------------------------------------------------ 6277d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 6287d8d0e25Snbeams inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 6297d8d0e25Snbeams CeedScalar r_t1[T1d]; 6307d8d0e25Snbeams CeedScalar r_t2[T1d]; 6317d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 6327d8d0e25Snbeams ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 6337d8d0e25Snbeams ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6347d8d0e25Snbeams ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 6357d8d0e25Snbeams } 6367d8d0e25Snbeams } 6377d8d0e25Snbeams 6387d8d0e25Snbeams //------------------------------------------------------------------------------ 6397d8d0e25Snbeams // 3D derivatives at quadrature points 6407d8d0e25Snbeams //------------------------------------------------------------------------------ 6417d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 6427d8d0e25Snbeams inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 6437d8d0e25Snbeams CeedScalar r_t1[T1d]; 6447d8d0e25Snbeams CeedScalar r_t2[T1d]; 6457d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 6467d8d0e25Snbeams ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 6477d8d0e25Snbeams ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6487d8d0e25Snbeams ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 6497d8d0e25Snbeams ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 6507d8d0e25Snbeams ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 6517d8d0e25Snbeams ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 6527d8d0e25Snbeams ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 6537d8d0e25Snbeams ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6547d8d0e25Snbeams ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 6557d8d0e25Snbeams } 6567d8d0e25Snbeams } 6577d8d0e25Snbeams 6587d8d0e25Snbeams //------------------------------------------------------------------------------ 6597d8d0e25Snbeams // 3D derivatives transpose 6607d8d0e25Snbeams //------------------------------------------------------------------------------ 6617d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 6627d8d0e25Snbeams inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 6637d8d0e25Snbeams CeedScalar r_t1[T1d]; 6647d8d0e25Snbeams CeedScalar r_t2[T1d]; 6657d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 6667d8d0e25Snbeams ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 6677d8d0e25Snbeams ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6687d8d0e25Snbeams ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 6697d8d0e25Snbeams ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 6707d8d0e25Snbeams ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 6717d8d0e25Snbeams ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 6727d8d0e25Snbeams ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 6737d8d0e25Snbeams ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 6747d8d0e25Snbeams ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 6757d8d0e25Snbeams } 6767d8d0e25Snbeams } 6777d8d0e25Snbeams 6787d8d0e25Snbeams //------------------------------------------------------------------------------ 6797d8d0e25Snbeams // 3D collocated derivatives computation 6807d8d0e25Snbeams //------------------------------------------------------------------------------ 6817d8d0e25Snbeams template <int NCOMP, int Q1d> 6827d8d0e25Snbeams inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 6837d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 6847d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) { 6857d8d0e25Snbeams data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 6867d8d0e25Snbeams __syncthreads(); 6877d8d0e25Snbeams // X derivative 6887d8d0e25Snbeams r_V[comp+0*NCOMP] = 0.0; 6897d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 6907d8d0e25Snbeams r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 6917d8d0e25Snbeams // Y derivative 6927d8d0e25Snbeams r_V[comp+1*NCOMP] = 0.0; 6937d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 6947d8d0e25Snbeams r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 6957d8d0e25Snbeams // Z derivative 6967d8d0e25Snbeams r_V[comp+2*NCOMP] = 0.0; 6977d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 6987d8d0e25Snbeams r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 6997d8d0e25Snbeams __syncthreads(); 7007d8d0e25Snbeams } 7017d8d0e25Snbeams } 7027d8d0e25Snbeams } 7037d8d0e25Snbeams 7047d8d0e25Snbeams //------------------------------------------------------------------------------ 7057d8d0e25Snbeams // 3D collocated derivatives transpose 7067d8d0e25Snbeams //------------------------------------------------------------------------------ 7077d8d0e25Snbeams template <int NCOMP, int Q1d> 7087d8d0e25Snbeams inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 7097d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 7107d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) { 7117d8d0e25Snbeams // X derivative 7127d8d0e25Snbeams data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 7137d8d0e25Snbeams __syncthreads(); 7147d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 7157d8d0e25Snbeams r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 7167d8d0e25Snbeams __syncthreads(); 7177d8d0e25Snbeams // Y derivative 7187d8d0e25Snbeams data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 7197d8d0e25Snbeams __syncthreads(); 7207d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 7217d8d0e25Snbeams r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 7227d8d0e25Snbeams __syncthreads(); 7237d8d0e25Snbeams // Z derivative 7247d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 7257d8d0e25Snbeams r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 7267d8d0e25Snbeams } 7277d8d0e25Snbeams } 7287d8d0e25Snbeams } 7297d8d0e25Snbeams 7307d8d0e25Snbeams //------------------------------------------------------------------------------ 7317d8d0e25Snbeams // 1D quadrature weights 7327d8d0e25Snbeams //------------------------------------------------------------------------------ 7337d8d0e25Snbeams template <int Q1d> 7347d8d0e25Snbeams inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 7357d8d0e25Snbeams *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 7367d8d0e25Snbeams } 7377d8d0e25Snbeams 7387d8d0e25Snbeams //------------------------------------------------------------------------------ 7397d8d0e25Snbeams // 2D quadrature weights 7407d8d0e25Snbeams //------------------------------------------------------------------------------ 7417d8d0e25Snbeams template <int Q1d> 7427d8d0e25Snbeams inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 7437d8d0e25Snbeams *w = (data.tidx < Q1d && data.tidy < Q1d) ? 7447d8d0e25Snbeams qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 7457d8d0e25Snbeams } 7467d8d0e25Snbeams 7477d8d0e25Snbeams //------------------------------------------------------------------------------ 7487d8d0e25Snbeams // 3D quadrature weights 7497d8d0e25Snbeams //------------------------------------------------------------------------------ 7507d8d0e25Snbeams template <int Q1d> 7517d8d0e25Snbeams inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 7527d8d0e25Snbeams const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 7537d8d0e25Snbeams const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 7547d8d0e25Snbeams for (CeedInt z = 0; z < Q1d; ++z) 7557d8d0e25Snbeams w[z] = quad ? pw*qweight1d[z] : 0.0; 7567d8d0e25Snbeams } 7577d8d0e25Snbeams 7587d8d0e25Snbeams ); 7597d8d0e25Snbeams //------------------------------------------------------------------------------ 7607d8d0e25Snbeams // Build singe operator kernel 7617d8d0e25Snbeams //------------------------------------------------------------------------------ 76237c3b1cfSnbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 7637d8d0e25Snbeams 7647d8d0e25Snbeams using std::ostringstream; 7657d8d0e25Snbeams using std::string; 7667d8d0e25Snbeams int ierr; 7677d8d0e25Snbeams bool setupdone; 768e15f9bd0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 769e15f9bd0SJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 7707d8d0e25Snbeams Ceed ceed; 771e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 7727d8d0e25Snbeams CeedOperator_Hip_gen *data; 773e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 7747d8d0e25Snbeams CeedQFunction qf; 7757d8d0e25Snbeams CeedQFunction_Hip_gen *qf_data; 776e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 777e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 778e79b91d9SJeremy L Thompson CeedSize lsize; 77978464608Sjeremylt CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 780e79b91d9SJeremy L Thompson numoutputfields, ncomp, dim = 0; 781e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 782e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 7837d8d0e25Snbeams CeedOperatorField *opinputfields, *opoutputfields; 7847e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 785e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7867d8d0e25Snbeams CeedQFunctionField *qfinputfields, *qfoutputfields; 7877e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 788e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7897d8d0e25Snbeams CeedEvalMode emode; 7907d8d0e25Snbeams CeedBasis basis; 7917d8d0e25Snbeams CeedBasis_Hip_shared *basis_data; 7927d8d0e25Snbeams CeedElemRestriction Erestrict; 7937d8d0e25Snbeams CeedElemRestriction_Hip *restr_data; 7947d8d0e25Snbeams 7950b454692Sjeremylt // Check for restriction only identity operator 7960b454692Sjeremylt bool is_identity_qf; 7970b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 7980b454692Sjeremylt if (is_identity_qf) { 7990b454692Sjeremylt CeedEvalMode emodein, emodeout; 8000b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 8010b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 8020b454692Sjeremylt if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 8030b454692Sjeremylt // LCOV_EXCL_START 8040b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 8050b454692Sjeremylt "Backend does not implement restriction only identity operators"); 8060b454692Sjeremylt // LCOV_EXCL_STOP 8070b454692Sjeremylt } 8080b454692Sjeremylt 8097d8d0e25Snbeams ostringstream code; 8107d8d0e25Snbeams string devFunctions(deviceFunctions); 8117d8d0e25Snbeams code << devFunctions; 8127d8d0e25Snbeams 8137d8d0e25Snbeams string qFunction(qf_data->qFunctionSource); 8147d8d0e25Snbeams string qFunctionName(qf_data->qFunctionName); 8157d8d0e25Snbeams string oper; 8167d8d0e25Snbeams oper = "CeedKernel_Hip_gen_" + qFunctionName; 8177d8d0e25Snbeams 8187d8d0e25Snbeams code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 81977841947SLeila Ghaffari code << "#define CEED_QFUNCTION_HELPER inline __device__ __forceinline__\n"; 820e15f9bd0SJeremy L Thompson code << "#define CeedPragmaSIMD\n"; 821e15f9bd0SJeremy L Thompson code << "#define CEED_ERROR_SUCCESS 0\n\n"; 8227d8d0e25Snbeams 8237d8d0e25Snbeams // Find dim and Q1d 8247d8d0e25Snbeams bool useCollograd = true; 8257d8d0e25Snbeams data->maxP1d = 0; 8267d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 827e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 8287d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 829e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8307d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 831e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8327d8d0e25Snbeams 8337d8d0e25Snbeams // Check for collocated gradient 834437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8357d8d0e25Snbeams 8367d8d0e25Snbeams // Collect dim and Q1d 837e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8387d8d0e25Snbeams bool isTensor; 839e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8407d8d0e25Snbeams if (isTensor) { 841e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 842e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 8437d8d0e25Snbeams if (P1d>data->maxP1d) data->maxP1d = P1d; 8447d8d0e25Snbeams } else { 8457d8d0e25Snbeams // LCOV_EXCL_START 846e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 8477d8d0e25Snbeams // LCOV_EXCL_STOP 8487d8d0e25Snbeams } 8497d8d0e25Snbeams } 8507d8d0e25Snbeams } 8517d8d0e25Snbeams // Check output bases for Q1d, dim as well 8527d8d0e25Snbeams // The only imput basis might be CEED_BASIS_COLLOCATED 8537d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 854e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 8557d8d0e25Snbeams 8567d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 857e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8587d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 859e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8607d8d0e25Snbeams 8617d8d0e25Snbeams // Collect dim and Q1d 862e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8637d8d0e25Snbeams bool isTensor; 864e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8657d8d0e25Snbeams if (isTensor) { 866e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 8677d8d0e25Snbeams } else { 8687d8d0e25Snbeams // LCOV_EXCL_START 869e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 8707d8d0e25Snbeams // LCOV_EXCL_STOP 8717d8d0e25Snbeams } 8727d8d0e25Snbeams 8737d8d0e25Snbeams // Check for collocated gradient 874437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8757d8d0e25Snbeams } 8767d8d0e25Snbeams } 8777d8d0e25Snbeams data->dim = dim; 8787d8d0e25Snbeams data->Q1d = Q1d; 8797d8d0e25Snbeams 8807d8d0e25Snbeams // Define CEED_Q_VLA 8817d8d0e25Snbeams if (dim != 3 || useCollograd) { 8827d8d0e25Snbeams code << "\n#define CEED_Q_VLA 1\n\n"; 8837d8d0e25Snbeams } else { 8847d8d0e25Snbeams code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8857d8d0e25Snbeams } 8867d8d0e25Snbeams 8877d8d0e25Snbeams code << qFunction; 8887d8d0e25Snbeams 8897d8d0e25Snbeams // Setup 8907d8d0e25Snbeams code << "\n// -----------------------------------------------------------------------------\n"; 891b3e1519bSnbeams code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 892b3e1519bSnbeams code << "__global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n"; 8937d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 8947d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 895e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8967d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 8977d8d0e25Snbeams code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 8987d8d0e25Snbeams } 8997d8d0e25Snbeams } 9007d8d0e25Snbeams 9017d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 9027d8d0e25Snbeams code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 9037d8d0e25Snbeams } 9047d8d0e25Snbeams 9057d8d0e25Snbeams code << " const CeedInt Dim = "<<dim<<";\n"; 9067d8d0e25Snbeams code << " const CeedInt Q1d = "<<Q1d<<";\n"; 9077d8d0e25Snbeams 9087d8d0e25Snbeams code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 9097d8d0e25Snbeams code << " BackendData data;\n"; 9107d8d0e25Snbeams code << " data.tidx = threadIdx.x;\n"; 9117d8d0e25Snbeams code << " data.tidy = threadIdx.y;\n"; 9127d8d0e25Snbeams code << " data.tidz = threadIdx.z;\n"; 9137d8d0e25Snbeams code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 9147d8d0e25Snbeams code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 9157d8d0e25Snbeams 9167d8d0e25Snbeams code << "\n // -- Input field constants and basis data --\n"; 9177d8d0e25Snbeams //Initialize constants, and matrices B and G 9187d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 9197d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 9207d8d0e25Snbeams // Get elemsize, emode, ncomp 9217d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 922e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9237d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 924e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9257d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 926e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9277d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 928e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9297d8d0e25Snbeams 9307d8d0e25Snbeams // Set field constants 9317d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT) { 932e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 9337d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 934e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 9357d8d0e25Snbeams code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 9367d8d0e25Snbeams } else { 9377d8d0e25Snbeams code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 9387d8d0e25Snbeams } 9397d8d0e25Snbeams code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9407d8d0e25Snbeams } 9417d8d0e25Snbeams 9427d8d0e25Snbeams // Load basis data 9437d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 9447d8d0e25Snbeams switch (emode) { 9457d8d0e25Snbeams case CEED_EVAL_NONE: 9467d8d0e25Snbeams break; 9477d8d0e25Snbeams case CEED_EVAL_INTERP: 948e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 949437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 9517d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 9527d8d0e25Snbeams break; 9537d8d0e25Snbeams case CEED_EVAL_GRAD: 954e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 955437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 9577d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 9587d8d0e25Snbeams if (useCollograd) { 959437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_collo_grad_1d; 96080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 9617d8d0e25Snbeams code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 9627d8d0e25Snbeams } else { 963437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_grad_1d; 96480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 9657d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 9667d8d0e25Snbeams } 9677d8d0e25Snbeams break; 9687d8d0e25Snbeams case CEED_EVAL_WEIGHT: 9697d8d0e25Snbeams break; // No action 9707d8d0e25Snbeams case CEED_EVAL_DIV: 9717d8d0e25Snbeams break; // TODO: Not implemented 9727d8d0e25Snbeams case CEED_EVAL_CURL: 9737d8d0e25Snbeams break; // TODO: Not implemented 9747d8d0e25Snbeams } 9757d8d0e25Snbeams } 9767d8d0e25Snbeams 9777d8d0e25Snbeams code << "\n // -- Output field constants and basis data --\n"; 9787d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 9797d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 9807d8d0e25Snbeams // Get elemsize, emode, ncomp 9817d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 982e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9837d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 984e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9857d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 986e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9877d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 988e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9897d8d0e25Snbeams 9907d8d0e25Snbeams // Set field constants 991e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 9927d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 993e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 9947d8d0e25Snbeams code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 9957d8d0e25Snbeams } else { 9967d8d0e25Snbeams code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 9977d8d0e25Snbeams } 9987d8d0e25Snbeams code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 9997d8d0e25Snbeams 10007d8d0e25Snbeams // Load basis data 10017d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 10027d8d0e25Snbeams switch (emode) { 10037d8d0e25Snbeams case CEED_EVAL_NONE: 10047d8d0e25Snbeams break; // No action 10057d8d0e25Snbeams case CEED_EVAL_INTERP: 1006e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1007437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 100880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 10097d8d0e25Snbeams code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 10107d8d0e25Snbeams break; 10117d8d0e25Snbeams case CEED_EVAL_GRAD: 1012e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1013437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 101480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 10157d8d0e25Snbeams code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 10167d8d0e25Snbeams if (useCollograd) { 1017437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_collo_grad_1d; 101880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 10197d8d0e25Snbeams code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 10207d8d0e25Snbeams } else { 1021437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_grad_1d; 102280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 10237d8d0e25Snbeams code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 10247d8d0e25Snbeams } 10257d8d0e25Snbeams break; 10267d8d0e25Snbeams // LCOV_EXCL_START 10277d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 10287d8d0e25Snbeams Ceed ceed; 1029e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1030e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 10317d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 10327d8d0e25Snbeams break; // Should not occur 10337d8d0e25Snbeams } 10347d8d0e25Snbeams case CEED_EVAL_DIV: 10357d8d0e25Snbeams break; // TODO: Not implemented 10367d8d0e25Snbeams case CEED_EVAL_CURL: 10377d8d0e25Snbeams break; // TODO: Not implemented 10387d8d0e25Snbeams // LCOV_EXCL_STOP 10397d8d0e25Snbeams } 10407d8d0e25Snbeams } 10417d8d0e25Snbeams code << "\n // -- Element loop --\n"; 10427d8d0e25Snbeams code << " __syncthreads();\n"; 10437d8d0e25Snbeams code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 10447d8d0e25Snbeams // Input basis apply if needed 10457d8d0e25Snbeams // Generate the correct eval mode code for each input 10467d8d0e25Snbeams code << " // -- Input field restrictions and basis actions --\n"; 10477d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 10487d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 10497d8d0e25Snbeams // Get elemsize, emode, ncomp 10507d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1051e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10527d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1053e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10547d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1055e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10567d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1057e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10587d8d0e25Snbeams 10597d8d0e25Snbeams // Restriction 10607d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT && 10617d8d0e25Snbeams !((emode == CEED_EVAL_NONE) && useCollograd)) { 10627d8d0e25Snbeams code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10637d8d0e25Snbeams 10647d8d0e25Snbeams bool isStrided; 1065e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 10667d8d0e25Snbeams if (!isStrided) { 10677d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1068e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10697d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10707d8d0e25Snbeams CeedInt compstride; 1071e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 10727d8d0e25Snbeams code << " // CompStride: "<<compstride<<"\n"; 1073e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 10747d8d0e25Snbeams data->indices.in[i] = restr_data->d_ind; 10757d8d0e25Snbeams code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 10767d8d0e25Snbeams } else { 10777d8d0e25Snbeams bool backendstrides; 10787d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1079e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10807d8d0e25Snbeams CeedInt nelem; 10817d8d0e25Snbeams ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1082e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10837d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 10847d8d0e25Snbeams if (!backendstrides) { 10857d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1086e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10877d8d0e25Snbeams } 10887d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 10897d8d0e25Snbeams code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 10907d8d0e25Snbeams } 10917d8d0e25Snbeams } 10927d8d0e25Snbeams 10937d8d0e25Snbeams // Basis action 10947d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 10957d8d0e25Snbeams switch (emode) { 10967d8d0e25Snbeams case CEED_EVAL_NONE: 10977d8d0e25Snbeams if (!useCollograd) { 10987d8d0e25Snbeams code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 10997d8d0e25Snbeams } 11007d8d0e25Snbeams break; 11017d8d0e25Snbeams case CEED_EVAL_INTERP: 11027d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 11037d8d0e25Snbeams code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 11047d8d0e25Snbeams break; 11057d8d0e25Snbeams case CEED_EVAL_GRAD: 11067d8d0e25Snbeams if (useCollograd) { 11077d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 11087d8d0e25Snbeams code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 11097d8d0e25Snbeams } else { 11107d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 11117d8d0e25Snbeams code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 11127d8d0e25Snbeams } 11137d8d0e25Snbeams break; 11147d8d0e25Snbeams case CEED_EVAL_WEIGHT: 11157d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1116e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1117e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1118437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 11197d8d0e25Snbeams code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 11207d8d0e25Snbeams break; // No action 11217d8d0e25Snbeams case CEED_EVAL_DIV: 11227d8d0e25Snbeams break; // TODO: Not implemented 11237d8d0e25Snbeams case CEED_EVAL_CURL: 11247d8d0e25Snbeams break; // TODO: Not implemented 11257d8d0e25Snbeams } 11267d8d0e25Snbeams } 11277d8d0e25Snbeams 11287d8d0e25Snbeams // Q function 11297d8d0e25Snbeams code << "\n // -- Output field setup --\n"; 11307d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 11317d8d0e25Snbeams code << "\n // ---- Output field "<<i<<" ----\n"; 11327d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1133e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11347d8d0e25Snbeams if (emode==CEED_EVAL_GRAD) 11357d8d0e25Snbeams { 11367d8d0e25Snbeams if (useCollograd) { 11377d8d0e25Snbeams //Accumulator for gradient slices 11387d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 11397d8d0e25Snbeams code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 11407d8d0e25Snbeams code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 11417d8d0e25Snbeams code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 11427d8d0e25Snbeams code << " }\n"; 11437d8d0e25Snbeams code << " }\n"; 11447d8d0e25Snbeams } else { 11457d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 11467d8d0e25Snbeams } 11477d8d0e25Snbeams } 11487d8d0e25Snbeams if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 11497d8d0e25Snbeams { 11507d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 11517d8d0e25Snbeams } 11527d8d0e25Snbeams } 11537d8d0e25Snbeams // We treat quadrature points per slice in 3d to save registers 11547d8d0e25Snbeams if (useCollograd) { 11557d8d0e25Snbeams code << "\n // Note: Collocated Gradient\n"; 11567d8d0e25Snbeams code << "#pragma unroll\n"; 11577d8d0e25Snbeams code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 11587d8d0e25Snbeams code << " // -- Input fields --\n"; 11597d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 11607d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 11617d8d0e25Snbeams // Get elemsize, emode, ncomp 11627d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1163e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11647d8d0e25Snbeams // Basis action 11657d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 11667d8d0e25Snbeams switch (emode) { 11677d8d0e25Snbeams case CEED_EVAL_NONE: 11687d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11697d8d0e25Snbeams 11707d8d0e25Snbeams bool isStrided; 1171e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1172e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1173e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 11747d8d0e25Snbeams if (!isStrided) { 11757d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1176e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11777d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11787d8d0e25Snbeams CeedInt compstride; 1179e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 11807d8d0e25Snbeams code << " // CompStride: "<<compstride<<"\n"; 1181e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 11827d8d0e25Snbeams data->indices.in[i] = restr_data->d_ind; 11837d8d0e25Snbeams code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 11847d8d0e25Snbeams } else { 11857d8d0e25Snbeams bool backendstrides; 11867d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1187e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11887d8d0e25Snbeams CeedInt nelem; 11897d8d0e25Snbeams ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1190e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11917d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 11927d8d0e25Snbeams if (!backendstrides) { 11937d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1194e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11957d8d0e25Snbeams } 11967d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 11977d8d0e25Snbeams code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 11987d8d0e25Snbeams } 11997d8d0e25Snbeams break; 12007d8d0e25Snbeams case CEED_EVAL_INTERP: 12017d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 12027d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 12037d8d0e25Snbeams code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 12047d8d0e25Snbeams code << " }\n"; 12057d8d0e25Snbeams break; 12067d8d0e25Snbeams case CEED_EVAL_GRAD: 12077d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 12087d8d0e25Snbeams code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 12097d8d0e25Snbeams break; 12107d8d0e25Snbeams case CEED_EVAL_WEIGHT: 12117d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[1];\n"; 12127d8d0e25Snbeams code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 12137d8d0e25Snbeams break; // No action 12147d8d0e25Snbeams case CEED_EVAL_DIV: 12157d8d0e25Snbeams break; // TODO: Not implemented 12167d8d0e25Snbeams case CEED_EVAL_CURL: 12177d8d0e25Snbeams break; // TODO: Not implemented 12187d8d0e25Snbeams } 12197d8d0e25Snbeams } 12207d8d0e25Snbeams code << "\n // -- Output fields --\n"; 12217d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 12227d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 12237d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1224e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 12257d8d0e25Snbeams // Basis action 12267d8d0e25Snbeams switch (emode) { 12277d8d0e25Snbeams case CEED_EVAL_NONE: 12287d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 12297d8d0e25Snbeams break; // No action 12307d8d0e25Snbeams case CEED_EVAL_INTERP: 12317d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 12327d8d0e25Snbeams break; 12337d8d0e25Snbeams case CEED_EVAL_GRAD: 12347d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 12357d8d0e25Snbeams break; 12367d8d0e25Snbeams case CEED_EVAL_WEIGHT: 12377d8d0e25Snbeams break; // Should not occur 12387d8d0e25Snbeams case CEED_EVAL_DIV: 12397d8d0e25Snbeams break; // TODO: Not implemented 12407d8d0e25Snbeams case CEED_EVAL_CURL: 12417d8d0e25Snbeams break; // TODO: Not implemented 12427d8d0e25Snbeams } 12437d8d0e25Snbeams } 12447d8d0e25Snbeams } else { 12457d8d0e25Snbeams code << "\n // Note: No Collocated Gradient\n"; 12467d8d0e25Snbeams code << " // -- Input fields --\n"; 12477d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 12487d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 12497d8d0e25Snbeams code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 12507d8d0e25Snbeams } 12517d8d0e25Snbeams code << " // -- Output fields --\n"; 12527d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 12537d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 12547d8d0e25Snbeams code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 12557d8d0e25Snbeams } 12567d8d0e25Snbeams } 12577d8d0e25Snbeams code << "\n // -- QFunction Inputs and outputs --\n"; 12587d8d0e25Snbeams code << " CeedScalar* in["<<numinputfields<<"];\n"; 12597d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 12607d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 12617d8d0e25Snbeams code << " in["<<i<<"] = r_q"<<i<<";\n"; 12627d8d0e25Snbeams } 12637d8d0e25Snbeams code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12647d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 12657d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 12667d8d0e25Snbeams code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12677d8d0e25Snbeams } 12687d8d0e25Snbeams code << "\n // -- Apply QFunction --\n"; 12697d8d0e25Snbeams code << " "<<qFunctionName<<"(ctx, "; 12707d8d0e25Snbeams if (dim != 3 || useCollograd) { 12717d8d0e25Snbeams code << "1"; 12727d8d0e25Snbeams } else { 12737d8d0e25Snbeams code << "Q1d"; 12747d8d0e25Snbeams } 12757d8d0e25Snbeams code << ", in, out);\n"; 12767d8d0e25Snbeams if (useCollograd) { 12777d8d0e25Snbeams code << "\n // Note: Collocated Gradient\n"; 12787d8d0e25Snbeams code << " // -- Output fields --\n"; 12797d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 12807d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 12817d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1282e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 12837d8d0e25Snbeams // Basis action 12847d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 12857d8d0e25Snbeams switch (emode) { 12867d8d0e25Snbeams case CEED_EVAL_NONE: 12877d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 12887d8d0e25Snbeams code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 12897d8d0e25Snbeams code << " }\n"; 12907d8d0e25Snbeams break; // No action 12917d8d0e25Snbeams case CEED_EVAL_INTERP: 12927d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 12937d8d0e25Snbeams code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 12947d8d0e25Snbeams code << " }\n"; 12957d8d0e25Snbeams break; 12967d8d0e25Snbeams case CEED_EVAL_GRAD: 12977d8d0e25Snbeams code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 12987d8d0e25Snbeams break; 12997d8d0e25Snbeams case CEED_EVAL_WEIGHT: 13007d8d0e25Snbeams break; // Should not occur 13017d8d0e25Snbeams case CEED_EVAL_DIV: 13027d8d0e25Snbeams break; // TODO: Not implemented 13037d8d0e25Snbeams case CEED_EVAL_CURL: 13047d8d0e25Snbeams break; // TODO: Not implemented 13057d8d0e25Snbeams } 13067d8d0e25Snbeams } 13077d8d0e25Snbeams code << " }\n"; 13087d8d0e25Snbeams } 13097d8d0e25Snbeams 13107d8d0e25Snbeams // Output basis apply if needed 13117d8d0e25Snbeams // Generate the correct eval mode code for each output 13127d8d0e25Snbeams code << "\n // -- Output field basis action and restrictions --\n"; 13137d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 13147d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 13157d8d0e25Snbeams // Get elemsize, emode, ncomp 13167d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1317e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13187d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1319e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13207d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1321e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13227d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1323e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13247d8d0e25Snbeams // Basis action 13257d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 13267d8d0e25Snbeams switch (emode) { 13277d8d0e25Snbeams case CEED_EVAL_NONE: 13287d8d0e25Snbeams code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 13297d8d0e25Snbeams break; // No action 13307d8d0e25Snbeams case CEED_EVAL_INTERP: 13317d8d0e25Snbeams code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 13327d8d0e25Snbeams code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 13337d8d0e25Snbeams break; 13347d8d0e25Snbeams case CEED_EVAL_GRAD: 13357d8d0e25Snbeams code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 13367d8d0e25Snbeams if (useCollograd) { 13377d8d0e25Snbeams code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 13387d8d0e25Snbeams } else { 13397d8d0e25Snbeams code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 13407d8d0e25Snbeams } 13417d8d0e25Snbeams break; 13427d8d0e25Snbeams // LCOV_EXCL_START 13437d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 13447d8d0e25Snbeams Ceed ceed; 1345e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1346e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 13477d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 13487d8d0e25Snbeams break; // Should not occur 13497d8d0e25Snbeams } 13507d8d0e25Snbeams case CEED_EVAL_DIV: 13517d8d0e25Snbeams break; // TODO: Not implemented 13527d8d0e25Snbeams case CEED_EVAL_CURL: 13537d8d0e25Snbeams break; // TODO: Not implemented 13547d8d0e25Snbeams // LCOV_EXCL_STOP 13557d8d0e25Snbeams } 13567d8d0e25Snbeams // Restriction 13577d8d0e25Snbeams bool isStrided; 1358e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 13597d8d0e25Snbeams if (!isStrided) { 13607d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1361e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13627d8d0e25Snbeams code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13637d8d0e25Snbeams CeedInt compstride; 1364e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 13657d8d0e25Snbeams code << " // CompStride: "<<compstride<<"\n"; 1366e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 13677d8d0e25Snbeams data->indices.out[i] = restr_data->d_ind; 13687d8d0e25Snbeams code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 13697d8d0e25Snbeams } else { 13707d8d0e25Snbeams bool backendstrides; 13717d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1372e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13737d8d0e25Snbeams CeedInt nelem; 13747d8d0e25Snbeams ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1375e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13767d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 13777d8d0e25Snbeams if (!backendstrides) { 13787d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1379e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13807d8d0e25Snbeams } 13817d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 13827d8d0e25Snbeams code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 13837d8d0e25Snbeams } 13847d8d0e25Snbeams } 13857d8d0e25Snbeams 13867d8d0e25Snbeams code << " }\n"; 13877d8d0e25Snbeams code << "}\n"; 13887d8d0e25Snbeams code << "// -----------------------------------------------------------------------------\n\n"; 13897d8d0e25Snbeams 13907d8d0e25Snbeams // View kernel for debugging 139146dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 13923f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 13937d8d0e25Snbeams 1394*539ec17dSJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 139537c3b1cfSnbeams ierr = BlockGridCalculate_Hip_gen(dim, numelements, data->maxP1d, Q1d, block_sizes); 1396b3e1519bSnbeams CeedChkBackend(ierr); 1397b3e1519bSnbeams ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, 1398b3e1519bSnbeams "T1d", block_sizes[0], 1399b3e1519bSnbeams "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]); 1400e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 14017d8d0e25Snbeams ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op); 1402e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 14037d8d0e25Snbeams 1404e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1405e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 14067d8d0e25Snbeams } 14077d8d0e25Snbeams //------------------------------------------------------------------------------ 1408