1*7d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*7d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*7d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details. 4*7d8d0e25Snbeams // 5*7d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software 6*7d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral 7*7d8d0e25Snbeams // element discretizations for exascale applications. For more information and 8*7d8d0e25Snbeams // source code availability see http://github.com/ceed. 9*7d8d0e25Snbeams // 10*7d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*7d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office 12*7d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for 13*7d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including 14*7d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early 15*7d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative. 
16*7d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 17*7d8d0e25Snbeams 18*7d8d0e25Snbeams #include "ceed-hip-gen.h" 19*7d8d0e25Snbeams #include <iostream> 20*7d8d0e25Snbeams #include <sstream> 21*7d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 22*7d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 23*7d8d0e25Snbeams 24*7d8d0e25Snbeams static const char *atomicAdd = QUOTE( 25*7d8d0e25Snbeams //------------------------------------------------------------------------------ 26*7d8d0e25Snbeams // Atomic add, for older CUDA 27*7d8d0e25Snbeams //------------------------------------------------------------------------------ 28*7d8d0e25Snbeams __device__ double atomicAdd(double *address, double val) { 29*7d8d0e25Snbeams unsigned long long int *address_as_ull = (unsigned long long int *)address; 30*7d8d0e25Snbeams unsigned long long int old = *address_as_ull, assumed; 31*7d8d0e25Snbeams do { 32*7d8d0e25Snbeams assumed = old; 33*7d8d0e25Snbeams old = 34*7d8d0e25Snbeams atomicCAS(address_as_ull, assumed, 35*7d8d0e25Snbeams __double_as_longlong(val + 36*7d8d0e25Snbeams __longlong_as_double(assumed))); 37*7d8d0e25Snbeams // Note: uses integer comparison to avoid hang in case of NaN 38*7d8d0e25Snbeams // (since NaN != NaN) 39*7d8d0e25Snbeams } while (assumed != old); 40*7d8d0e25Snbeams return __longlong_as_double(old); 41*7d8d0e25Snbeams } 42*7d8d0e25Snbeams ); 43*7d8d0e25Snbeams 44*7d8d0e25Snbeams static const char *deviceFunctions = QUOTE( 45*7d8d0e25Snbeams 46*7d8d0e25Snbeams //------------------------------------------------------------------------------ 47*7d8d0e25Snbeams // Typedefs 48*7d8d0e25Snbeams //------------------------------------------------------------------------------ 49*7d8d0e25Snbeams typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } HipFields; 50*7d8d0e25Snbeams typedef struct { CeedInt* in[16]; CeedInt* out[16]; } HipFieldsInt; 51*7d8d0e25Snbeams 52*7d8d0e25Snbeams typedef struct { 53*7d8d0e25Snbeams CeedInt tidx; 54*7d8d0e25Snbeams 
CeedInt tidy; 55*7d8d0e25Snbeams CeedInt tidz; 56*7d8d0e25Snbeams CeedInt tid; 57*7d8d0e25Snbeams CeedScalar* slice; 58*7d8d0e25Snbeams } BackendData; 59*7d8d0e25Snbeams 60*7d8d0e25Snbeams //------------------------------------------------------------------------------ 61*7d8d0e25Snbeams // Load matrices for basis actions 62*7d8d0e25Snbeams //------------------------------------------------------------------------------ 63*7d8d0e25Snbeams template <int P, int Q> 64*7d8d0e25Snbeams inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 65*7d8d0e25Snbeams for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 66*7d8d0e25Snbeams B[i] = d_B[i]; 67*7d8d0e25Snbeams } 68*7d8d0e25Snbeams 69*7d8d0e25Snbeams //------------------------------------------------------------------------------ 70*7d8d0e25Snbeams // 1D 71*7d8d0e25Snbeams //------------------------------------------------------------------------------ 72*7d8d0e25Snbeams 73*7d8d0e25Snbeams //------------------------------------------------------------------------------ 74*7d8d0e25Snbeams // L-vector -> E-vector, offsets provided 75*7d8d0e25Snbeams //------------------------------------------------------------------------------ 76*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 77*7d8d0e25Snbeams inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78*7d8d0e25Snbeams if (data.tidx < P1d) { 79*7d8d0e25Snbeams const CeedInt node = data.tidx; 80*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d]; 81*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 82*7d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 83*7d8d0e25Snbeams } 84*7d8d0e25Snbeams } 85*7d8d0e25Snbeams 86*7d8d0e25Snbeams //------------------------------------------------------------------------------ 87*7d8d0e25Snbeams // L-vector -> E-vector, 
strided 88*7d8d0e25Snbeams //------------------------------------------------------------------------------ 89*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 90*7d8d0e25Snbeams inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 91*7d8d0e25Snbeams if (data.tidx < P1d) { 92*7d8d0e25Snbeams const CeedInt node = data.tidx; 93*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 94*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 95*7d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 96*7d8d0e25Snbeams } 97*7d8d0e25Snbeams } 98*7d8d0e25Snbeams 99*7d8d0e25Snbeams //------------------------------------------------------------------------------ 100*7d8d0e25Snbeams // E-vector -> L-vector, offsets provided 101*7d8d0e25Snbeams //------------------------------------------------------------------------------ 102*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 103*7d8d0e25Snbeams inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 104*7d8d0e25Snbeams if (data.tidx < P1d) { 105*7d8d0e25Snbeams const CeedInt node = data.tidx; 106*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d]; 107*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 108*7d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 109*7d8d0e25Snbeams } 110*7d8d0e25Snbeams } 111*7d8d0e25Snbeams 112*7d8d0e25Snbeams //------------------------------------------------------------------------------ 113*7d8d0e25Snbeams // E-vector -> L-vector, strided 114*7d8d0e25Snbeams //------------------------------------------------------------------------------ 115*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 116*7d8d0e25Snbeams inline __device__ void 
writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 117*7d8d0e25Snbeams if (data.tidx < P1d) { 118*7d8d0e25Snbeams const CeedInt node = data.tidx; 119*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 120*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 121*7d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 122*7d8d0e25Snbeams } 123*7d8d0e25Snbeams } 124*7d8d0e25Snbeams 125*7d8d0e25Snbeams //------------------------------------------------------------------------------ 126*7d8d0e25Snbeams // 1D tensor contraction x 127*7d8d0e25Snbeams //------------------------------------------------------------------------------ 128*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 129*7d8d0e25Snbeams inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130*7d8d0e25Snbeams data.slice[data.tidx] = *U; 131*7d8d0e25Snbeams __syncthreads(); 132*7d8d0e25Snbeams *V = 0.0; 133*7d8d0e25Snbeams if (data.tidx < Q1d) 134*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 135*7d8d0e25Snbeams *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 136*7d8d0e25Snbeams __syncthreads(); 137*7d8d0e25Snbeams } 138*7d8d0e25Snbeams 139*7d8d0e25Snbeams //------------------------------------------------------------------------------ 140*7d8d0e25Snbeams // 1D transpose tensor contraction x 141*7d8d0e25Snbeams //------------------------------------------------------------------------------ 142*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 143*7d8d0e25Snbeams inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 144*7d8d0e25Snbeams data.slice[data.tidx] = *U; 145*7d8d0e25Snbeams __syncthreads(); 146*7d8d0e25Snbeams *V = 0.0; 147*7d8d0e25Snbeams if (data.tidx < P1d) 148*7d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 149*7d8d0e25Snbeams *V += B[data.tidx + 
i*P1d] * data.slice[i]; // Contract x direction 150*7d8d0e25Snbeams __syncthreads(); 151*7d8d0e25Snbeams } 152*7d8d0e25Snbeams 153*7d8d0e25Snbeams //------------------------------------------------------------------------------ 154*7d8d0e25Snbeams // 1D interpolate to quadrature points 155*7d8d0e25Snbeams //------------------------------------------------------------------------------ 156*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 157*7d8d0e25Snbeams inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 158*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 159*7d8d0e25Snbeams ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 160*7d8d0e25Snbeams } 161*7d8d0e25Snbeams 162*7d8d0e25Snbeams //------------------------------------------------------------------------------ 163*7d8d0e25Snbeams // 1D interpolate transpose 164*7d8d0e25Snbeams //------------------------------------------------------------------------------ 165*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 166*7d8d0e25Snbeams inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 167*7d8d0e25Snbeams for (CeedInt comp=0; comp<NCOMP; comp++) 168*7d8d0e25Snbeams ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 169*7d8d0e25Snbeams } 170*7d8d0e25Snbeams 171*7d8d0e25Snbeams //------------------------------------------------------------------------------ 172*7d8d0e25Snbeams // 1D derivatives at quadrature points 173*7d8d0e25Snbeams //------------------------------------------------------------------------------ 174*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 175*7d8d0e25Snbeams inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 
176*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 177*7d8d0e25Snbeams ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 178*7d8d0e25Snbeams } 179*7d8d0e25Snbeams 180*7d8d0e25Snbeams //------------------------------------------------------------------------------ 181*7d8d0e25Snbeams // 1D derivatives transpose 182*7d8d0e25Snbeams //------------------------------------------------------------------------------ 183*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 184*7d8d0e25Snbeams inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 185*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) 186*7d8d0e25Snbeams ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 187*7d8d0e25Snbeams } 188*7d8d0e25Snbeams 189*7d8d0e25Snbeams //------------------------------------------------------------------------------ 190*7d8d0e25Snbeams // 2D 191*7d8d0e25Snbeams //------------------------------------------------------------------------------ 192*7d8d0e25Snbeams 193*7d8d0e25Snbeams //------------------------------------------------------------------------------ 194*7d8d0e25Snbeams // L-vector -> E-vector, offsets provided 195*7d8d0e25Snbeams //------------------------------------------------------------------------------ 196*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 197*7d8d0e25Snbeams inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 198*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 199*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 200*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d]; 201*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 202*7d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 
203*7d8d0e25Snbeams } 204*7d8d0e25Snbeams } 205*7d8d0e25Snbeams 206*7d8d0e25Snbeams //------------------------------------------------------------------------------ 207*7d8d0e25Snbeams // L-vector -> E-vector, strided 208*7d8d0e25Snbeams //------------------------------------------------------------------------------ 209*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 210*7d8d0e25Snbeams inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 211*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 212*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 213*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 214*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 215*7d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 216*7d8d0e25Snbeams } 217*7d8d0e25Snbeams } 218*7d8d0e25Snbeams 219*7d8d0e25Snbeams //------------------------------------------------------------------------------ 220*7d8d0e25Snbeams // E-vector -> L-vector, offsets provided 221*7d8d0e25Snbeams //------------------------------------------------------------------------------ 222*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 223*7d8d0e25Snbeams inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 224*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 225*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 226*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d]; 227*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 228*7d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 229*7d8d0e25Snbeams } 230*7d8d0e25Snbeams } 231*7d8d0e25Snbeams 232*7d8d0e25Snbeams //------------------------------------------------------------------------------ 
233*7d8d0e25Snbeams // E-vector -> L-vector, strided 234*7d8d0e25Snbeams //------------------------------------------------------------------------------ 235*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 236*7d8d0e25Snbeams inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 237*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) { 238*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d; 239*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 240*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 241*7d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 242*7d8d0e25Snbeams } 243*7d8d0e25Snbeams } 244*7d8d0e25Snbeams 245*7d8d0e25Snbeams //------------------------------------------------------------------------------ 246*7d8d0e25Snbeams // 2D tensor contraction x 247*7d8d0e25Snbeams //------------------------------------------------------------------------------ 248*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 249*7d8d0e25Snbeams inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 250*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 251*7d8d0e25Snbeams __syncthreads(); 252*7d8d0e25Snbeams *V = 0.0; 253*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 254*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 255*7d8d0e25Snbeams *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 256*7d8d0e25Snbeams __syncthreads(); 257*7d8d0e25Snbeams } 258*7d8d0e25Snbeams 259*7d8d0e25Snbeams //------------------------------------------------------------------------------ 260*7d8d0e25Snbeams // 2D tensor contract y 261*7d8d0e25Snbeams //------------------------------------------------------------------------------ 262*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 263*7d8d0e25Snbeams 
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 264*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 265*7d8d0e25Snbeams __syncthreads(); 266*7d8d0e25Snbeams *V = 0.0; 267*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 268*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 269*7d8d0e25Snbeams *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 270*7d8d0e25Snbeams __syncthreads(); 271*7d8d0e25Snbeams } 272*7d8d0e25Snbeams 273*7d8d0e25Snbeams //------------------------------------------------------------------------------ 274*7d8d0e25Snbeams // 2D transpose tensor contract y 275*7d8d0e25Snbeams //------------------------------------------------------------------------------ 276*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 277*7d8d0e25Snbeams inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 278*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 279*7d8d0e25Snbeams __syncthreads(); 280*7d8d0e25Snbeams *V = 0.0; 281*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 282*7d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 283*7d8d0e25Snbeams *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 284*7d8d0e25Snbeams __syncthreads(); 285*7d8d0e25Snbeams } 286*7d8d0e25Snbeams 287*7d8d0e25Snbeams //------------------------------------------------------------------------------ 288*7d8d0e25Snbeams // 2D transpose tensor contract x 289*7d8d0e25Snbeams //------------------------------------------------------------------------------ 290*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 291*7d8d0e25Snbeams inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 292*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 293*7d8d0e25Snbeams __syncthreads(); 294*7d8d0e25Snbeams 
*V = 0.0; 295*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 296*7d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 297*7d8d0e25Snbeams *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 298*7d8d0e25Snbeams __syncthreads(); 299*7d8d0e25Snbeams } 300*7d8d0e25Snbeams 301*7d8d0e25Snbeams //------------------------------------------------------------------------------ 302*7d8d0e25Snbeams // 2D transpose tensor contract and add x 303*7d8d0e25Snbeams //------------------------------------------------------------------------------ 304*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 305*7d8d0e25Snbeams inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 306*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = *U; 307*7d8d0e25Snbeams __syncthreads(); 308*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 309*7d8d0e25Snbeams for (CeedInt i = 0; i < Q1d; ++i) 310*7d8d0e25Snbeams *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 311*7d8d0e25Snbeams __syncthreads(); 312*7d8d0e25Snbeams } 313*7d8d0e25Snbeams 314*7d8d0e25Snbeams //------------------------------------------------------------------------------ 315*7d8d0e25Snbeams // 2D interpolate to quadrature points 316*7d8d0e25Snbeams //------------------------------------------------------------------------------ 317*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 318*7d8d0e25Snbeams inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 319*7d8d0e25Snbeams CeedScalar r_t[1]; 320*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 321*7d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 322*7d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 323*7d8d0e25Snbeams } 324*7d8d0e25Snbeams } 325*7d8d0e25Snbeams 326*7d8d0e25Snbeams 
//------------------------------------------------------------------------------ 327*7d8d0e25Snbeams // 2D interpolate transpose 328*7d8d0e25Snbeams //------------------------------------------------------------------------------ 329*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 330*7d8d0e25Snbeams inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 331*7d8d0e25Snbeams CeedScalar r_t[1]; 332*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 333*7d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 334*7d8d0e25Snbeams ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 335*7d8d0e25Snbeams } 336*7d8d0e25Snbeams } 337*7d8d0e25Snbeams 338*7d8d0e25Snbeams //------------------------------------------------------------------------------ 339*7d8d0e25Snbeams // 2D derivatives at quadrature points 340*7d8d0e25Snbeams //------------------------------------------------------------------------------ 341*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 342*7d8d0e25Snbeams inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 343*7d8d0e25Snbeams CeedScalar r_t[1]; 344*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 345*7d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 346*7d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 347*7d8d0e25Snbeams ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 348*7d8d0e25Snbeams ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 349*7d8d0e25Snbeams } 350*7d8d0e25Snbeams } 351*7d8d0e25Snbeams 352*7d8d0e25Snbeams //------------------------------------------------------------------------------ 353*7d8d0e25Snbeams // 2D derivatives transpose 354*7d8d0e25Snbeams 
//------------------------------------------------------------------------------ 355*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 356*7d8d0e25Snbeams inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 357*7d8d0e25Snbeams CeedScalar r_t[1]; 358*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; comp++) { 359*7d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 360*7d8d0e25Snbeams ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 361*7d8d0e25Snbeams ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 362*7d8d0e25Snbeams ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 363*7d8d0e25Snbeams } 364*7d8d0e25Snbeams } 365*7d8d0e25Snbeams 366*7d8d0e25Snbeams //------------------------------------------------------------------------------ 367*7d8d0e25Snbeams // 3D 368*7d8d0e25Snbeams //------------------------------------------------------------------------------ 369*7d8d0e25Snbeams 370*7d8d0e25Snbeams //------------------------------------------------------------------------------ 371*7d8d0e25Snbeams // L-vector -> E-vector, offsets provided 372*7d8d0e25Snbeams //------------------------------------------------------------------------------ 373*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 374*7d8d0e25Snbeams inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 375*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 376*7d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 377*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 378*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 379*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 380*7d8d0e25Snbeams 
r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 381*7d8d0e25Snbeams } 382*7d8d0e25Snbeams } 383*7d8d0e25Snbeams 384*7d8d0e25Snbeams //------------------------------------------------------------------------------ 385*7d8d0e25Snbeams // L-vector -> E-vector, strided 386*7d8d0e25Snbeams //------------------------------------------------------------------------------ 387*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 388*7d8d0e25Snbeams inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 389*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 390*7d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 391*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 392*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 393*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 394*7d8d0e25Snbeams r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 395*7d8d0e25Snbeams } 396*7d8d0e25Snbeams } 397*7d8d0e25Snbeams 398*7d8d0e25Snbeams //------------------------------------------------------------------------------ 399*7d8d0e25Snbeams // E-vector -> Q-vector, offests provided 400*7d8d0e25Snbeams //------------------------------------------------------------------------------ 401*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int Q1d> 402*7d8d0e25Snbeams inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 403*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 404*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 405*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 406*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 407*7d8d0e25Snbeams r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 408*7d8d0e25Snbeams } 
409*7d8d0e25Snbeams } 410*7d8d0e25Snbeams 411*7d8d0e25Snbeams //------------------------------------------------------------------------------ 412*7d8d0e25Snbeams // E-vector -> Q-vector, strided 413*7d8d0e25Snbeams //------------------------------------------------------------------------------ 414*7d8d0e25Snbeams template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 415*7d8d0e25Snbeams inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 416*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) { 417*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 418*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 419*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 420*7d8d0e25Snbeams r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 421*7d8d0e25Snbeams } 422*7d8d0e25Snbeams } 423*7d8d0e25Snbeams 424*7d8d0e25Snbeams //------------------------------------------------------------------------------ 425*7d8d0e25Snbeams // E-vector -> L-vector, offsets provided 426*7d8d0e25Snbeams //------------------------------------------------------------------------------ 427*7d8d0e25Snbeams template <int NCOMP, int COMPSTRIDE, int P1d> 428*7d8d0e25Snbeams inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 429*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 430*7d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 431*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 432*7d8d0e25Snbeams const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 433*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 434*7d8d0e25Snbeams atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 435*7d8d0e25Snbeams } 436*7d8d0e25Snbeams } 437*7d8d0e25Snbeams 
438*7d8d0e25Snbeams //------------------------------------------------------------------------------ 439*7d8d0e25Snbeams // E-vector -> L-vector, strided 440*7d8d0e25Snbeams //------------------------------------------------------------------------------ 441*7d8d0e25Snbeams template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 442*7d8d0e25Snbeams inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 443*7d8d0e25Snbeams if (data.tidx < P1d && data.tidy < P1d) 444*7d8d0e25Snbeams for (CeedInt z = 0; z < P1d; ++z) { 445*7d8d0e25Snbeams const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 446*7d8d0e25Snbeams const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 447*7d8d0e25Snbeams for (CeedInt comp = 0; comp < NCOMP; ++comp) 448*7d8d0e25Snbeams d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 449*7d8d0e25Snbeams } 450*7d8d0e25Snbeams } 451*7d8d0e25Snbeams 452*7d8d0e25Snbeams //------------------------------------------------------------------------------ 453*7d8d0e25Snbeams // 3D tensor contract x 454*7d8d0e25Snbeams //------------------------------------------------------------------------------ 455*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 456*7d8d0e25Snbeams inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 457*7d8d0e25Snbeams CeedScalar r_B[P1d]; 458*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 459*7d8d0e25Snbeams r_B[i] = B[i + data.tidx*P1d]; 460*7d8d0e25Snbeams 461*7d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 462*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 463*7d8d0e25Snbeams __syncthreads(); 464*7d8d0e25Snbeams V[k] = 0.0; 465*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < P1d) 466*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 467*7d8d0e25Snbeams V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 468*7d8d0e25Snbeams 
__syncthreads(); 469*7d8d0e25Snbeams } 470*7d8d0e25Snbeams } 471*7d8d0e25Snbeams 472*7d8d0e25Snbeams //------------------------------------------------------------------------------ 473*7d8d0e25Snbeams // 3D tensor contract y 474*7d8d0e25Snbeams //------------------------------------------------------------------------------ 475*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 476*7d8d0e25Snbeams inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 477*7d8d0e25Snbeams CeedScalar r_B[P1d]; 478*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 479*7d8d0e25Snbeams r_B[i] = B[i + data.tidy*P1d]; 480*7d8d0e25Snbeams 481*7d8d0e25Snbeams for (CeedInt k = 0; k < P1d; ++k) { 482*7d8d0e25Snbeams data.slice[data.tidx+data.tidy*T1d] = U[k]; 483*7d8d0e25Snbeams __syncthreads(); 484*7d8d0e25Snbeams V[k] = 0.0; 485*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 486*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 487*7d8d0e25Snbeams V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 488*7d8d0e25Snbeams __syncthreads(); 489*7d8d0e25Snbeams } 490*7d8d0e25Snbeams } 491*7d8d0e25Snbeams 492*7d8d0e25Snbeams //------------------------------------------------------------------------------ 493*7d8d0e25Snbeams // 3D tensor contract z 494*7d8d0e25Snbeams //------------------------------------------------------------------------------ 495*7d8d0e25Snbeams template <int NCOMP, int P1d, int Q1d> 496*7d8d0e25Snbeams inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 497*7d8d0e25Snbeams for (CeedInt k = 0; k < Q1d; ++k) { 498*7d8d0e25Snbeams V[k] = 0.0; 499*7d8d0e25Snbeams if (data.tidx < Q1d && data.tidy < Q1d) 500*7d8d0e25Snbeams for (CeedInt i = 0; i < P1d; ++i) 501*7d8d0e25Snbeams V[k] += B[i + k*P1d] * U[i]; // Contract z direction 502*7d8d0e25Snbeams } 503*7d8d0e25Snbeams } 504*7d8d0e25Snbeams 505*7d8d0e25Snbeams 
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//
// Transpose of ContractZ3d (B indexed as B[k + i*P1d], i.e. B^T): purely
// register-resident along z, so no shared memory or barriers are needed.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P1d; ++k) {
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < Q1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//
// Transpose of ContractY3d: stages each z-level slab of U in the shared
// slice, then contracts each thread's y-line against its cached column of B
// (read with stride P1d, i.e. B^T).  The two __syncthreads() calls bracket
// the shared-slab lifetime exactly as in the forward contractions.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//
// Identical to ContractTransposeY3d except V is accumulated into (no
// V[k] = 0.0 reset) — used when summing several directional contributions.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < Q1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//
// Transpose of ContractX3d: output tile is P1d x P1d (quadrature -> nodes).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//
// Accumulating variant of ContractTransposeX3d (no V[k] = 0.0 reset).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];

  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*T1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d && data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//
// Per component: B applied along x, then y, then z.  r_t1/r_t2 are scratch
// register arrays sized T1d (the compile-time max of P1d/Q1d in the
// generated kernel).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//
// Adjoint of interp3d: B^T applied z, then y, then x (reverse order).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//
// Full (non-collocated) gradient: three passes per component, each applying
// the derivative matrix G in exactly one direction and the interpolation
// matrix B in the other two.  Output layout: r_V[comp*Q1d + dim*NCOMP*Q1d]
// with dim = 0 (d/dx), 1 (d/dy), 2 (d/dz).
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//
// Adjoint of grad3d: the first directional pass initializes r_V (plain
// ContractTransposeX3d); the two remaining passes accumulate via the
// Add-variant so all three directions sum into the same output.
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T1d];
  CeedScalar r_t2[T1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//
// Collocated-gradient variant: basis and quadrature points coincide, so all
// three derivatives at z-level q come from the single Q1d x Q1d matrix c_G.
// x/y derivatives read the staged shared slab; the z derivative contracts the
// per-thread register array r_U directly.
// NOTE(review): the __syncthreads() calls here execute inside the
// tidx/tidy < Q1d guard — safe only if every thread of the block enters the
// branch (i.e. the block is exactly Q1d x Q1d, T1d == Q1d); confirm the
// launch configuration used with the collocated-gradient path.
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d];
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      // Z derivative
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      __syncthreads();
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//
// Adjoint of gradCollo3d; all three contributions accumulate (+=) into r_V.
// The z pass is a PARTIAL contraction: for each q it adds this slice's
// contribution to every z-level, completed over the caller's loop on q.
// NOTE(review): same divergent-__syncthreads caveat as gradCollo3d.
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  if (data.tidx < Q1d && data.tidy < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//
// One weight per thread along x; threads past Q1d get 0.
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0;
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//
// Tensor product of the 1D weights over (tidx, tidy); 0 outside the tile.
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  *w = (data.tidx < Q1d && data.tidy < Q1d) ?
        qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//
// Per-thread x*y weight product, multiplied by the 1D weight of each of the
// Q1d z-levels into the register array w.
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
  const bool quad = (data.tidx < Q1d && data.tidy < Q1d);
  const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0;
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = quad ?
pw*qweight1d[z] : 0.0; 742*7d8d0e25Snbeams } 743*7d8d0e25Snbeams 744*7d8d0e25Snbeams ); 745*7d8d0e25Snbeams //------------------------------------------------------------------------------ 746*7d8d0e25Snbeams // Build singe operator kernel 747*7d8d0e25Snbeams //------------------------------------------------------------------------------ 748*7d8d0e25Snbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 749*7d8d0e25Snbeams 750*7d8d0e25Snbeams using std::ostringstream; 751*7d8d0e25Snbeams using std::string; 752*7d8d0e25Snbeams int ierr; 753*7d8d0e25Snbeams bool setupdone; 754*7d8d0e25Snbeams ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 755*7d8d0e25Snbeams if (setupdone) return 0; 756*7d8d0e25Snbeams Ceed ceed; 757*7d8d0e25Snbeams ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 758*7d8d0e25Snbeams CeedOperator_Hip_gen *data; 759*7d8d0e25Snbeams ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 760*7d8d0e25Snbeams CeedQFunction qf; 761*7d8d0e25Snbeams CeedQFunction_Hip_gen *qf_data; 762*7d8d0e25Snbeams ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 763*7d8d0e25Snbeams ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 764*7d8d0e25Snbeams CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 765*7d8d0e25Snbeams numoutputfields, ncomp, dim = 0, lsize; 766*7d8d0e25Snbeams ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 767*7d8d0e25Snbeams ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 768*7d8d0e25Snbeams ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 769*7d8d0e25Snbeams CeedChk(ierr); 770*7d8d0e25Snbeams CeedOperatorField *opinputfields, *opoutputfields; 771*7d8d0e25Snbeams ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 772*7d8d0e25Snbeams CeedChk(ierr); 773*7d8d0e25Snbeams CeedQFunctionField *qfinputfields, *qfoutputfields; 774*7d8d0e25Snbeams ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 
775*7d8d0e25Snbeams CeedChk(ierr); 776*7d8d0e25Snbeams CeedEvalMode emode; 777*7d8d0e25Snbeams CeedBasis basis; 778*7d8d0e25Snbeams CeedBasis_Hip_shared *basis_data; 779*7d8d0e25Snbeams CeedElemRestriction Erestrict; 780*7d8d0e25Snbeams CeedElemRestriction_Hip *restr_data; 781*7d8d0e25Snbeams 782*7d8d0e25Snbeams ostringstream code; 783*7d8d0e25Snbeams string devFunctions(deviceFunctions); 784*7d8d0e25Snbeams 785*7d8d0e25Snbeams // Add atomicAdd function for old NVidia architectures 786*7d8d0e25Snbeams struct hipDeviceProp_t prop; 787*7d8d0e25Snbeams Ceed_Hip *ceed_data; 788*7d8d0e25Snbeams ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 789*7d8d0e25Snbeams ierr = hipGetDeviceProperties(&prop, ceed_data->deviceId); 790*7d8d0e25Snbeams if (prop.major<6){ 791*7d8d0e25Snbeams code << atomicAdd; 792*7d8d0e25Snbeams } 793*7d8d0e25Snbeams 794*7d8d0e25Snbeams code << devFunctions; 795*7d8d0e25Snbeams 796*7d8d0e25Snbeams string qFunction(qf_data->qFunctionSource); 797*7d8d0e25Snbeams string qFunctionName(qf_data->qFunctionName); 798*7d8d0e25Snbeams string oper; 799*7d8d0e25Snbeams oper = "CeedKernel_Hip_gen_" + qFunctionName; 800*7d8d0e25Snbeams 801*7d8d0e25Snbeams code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 802*7d8d0e25Snbeams code << "\n#define CeedPragmaSIMD\n"; 803*7d8d0e25Snbeams 804*7d8d0e25Snbeams // Find dim and Q1d 805*7d8d0e25Snbeams bool useCollograd = true; 806*7d8d0e25Snbeams data->maxP1d = 0; 807*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 808*7d8d0e25Snbeams ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 809*7d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 810*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 811*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 812*7d8d0e25Snbeams CeedChk(ierr); 813*7d8d0e25Snbeams 814*7d8d0e25Snbeams // Check for collocated gradient 815*7d8d0e25Snbeams useCollograd = useCollograd && 
basis_data->d_collograd1d; 816*7d8d0e25Snbeams 817*7d8d0e25Snbeams // Collect dim and Q1d 818*7d8d0e25Snbeams ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 819*7d8d0e25Snbeams bool isTensor; 820*7d8d0e25Snbeams ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 821*7d8d0e25Snbeams if (isTensor) { 822*7d8d0e25Snbeams ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 823*7d8d0e25Snbeams ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 824*7d8d0e25Snbeams if (P1d>data->maxP1d) data->maxP1d = P1d; 825*7d8d0e25Snbeams } else { 826*7d8d0e25Snbeams // LCOV_EXCL_START 827*7d8d0e25Snbeams return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 828*7d8d0e25Snbeams // LCOV_EXCL_STOP 829*7d8d0e25Snbeams } 830*7d8d0e25Snbeams } 831*7d8d0e25Snbeams } 832*7d8d0e25Snbeams // Check output bases for Q1d, dim as well 833*7d8d0e25Snbeams // The only imput basis might be CEED_BASIS_COLLOCATED 834*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 835*7d8d0e25Snbeams ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 836*7d8d0e25Snbeams 837*7d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 838*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 839*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 840*7d8d0e25Snbeams CeedChk(ierr); 841*7d8d0e25Snbeams 842*7d8d0e25Snbeams // Collect dim and Q1d 843*7d8d0e25Snbeams ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 844*7d8d0e25Snbeams bool isTensor; 845*7d8d0e25Snbeams ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 846*7d8d0e25Snbeams if (isTensor) { 847*7d8d0e25Snbeams ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 848*7d8d0e25Snbeams } else { 849*7d8d0e25Snbeams // LCOV_EXCL_START 850*7d8d0e25Snbeams return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 851*7d8d0e25Snbeams // 
LCOV_EXCL_STOP 852*7d8d0e25Snbeams } 853*7d8d0e25Snbeams 854*7d8d0e25Snbeams // Check for collocated gradient 855*7d8d0e25Snbeams useCollograd = useCollograd && basis_data->d_collograd1d; 856*7d8d0e25Snbeams } 857*7d8d0e25Snbeams } 858*7d8d0e25Snbeams data->dim = dim; 859*7d8d0e25Snbeams data->Q1d = Q1d; 860*7d8d0e25Snbeams 861*7d8d0e25Snbeams // Define CEED_Q_VLA 862*7d8d0e25Snbeams if (dim != 3 || useCollograd) { 863*7d8d0e25Snbeams code << "\n#define CEED_Q_VLA 1\n\n"; 864*7d8d0e25Snbeams } else { 865*7d8d0e25Snbeams code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 866*7d8d0e25Snbeams } 867*7d8d0e25Snbeams 868*7d8d0e25Snbeams code << qFunction; 869*7d8d0e25Snbeams 870*7d8d0e25Snbeams // Setup 871*7d8d0e25Snbeams code << "\n// -----------------------------------------------------------------------------\n"; 872*7d8d0e25Snbeams code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, HipFieldsInt indices, HipFields fields, HipFields B, HipFields G, CeedScalar* W) {\n"; 873*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 874*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 875*7d8d0e25Snbeams CeedChk(ierr); 876*7d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 877*7d8d0e25Snbeams code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 878*7d8d0e25Snbeams } 879*7d8d0e25Snbeams } 880*7d8d0e25Snbeams 881*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 882*7d8d0e25Snbeams code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 883*7d8d0e25Snbeams } 884*7d8d0e25Snbeams 885*7d8d0e25Snbeams code << " const CeedInt Dim = "<<dim<<";\n"; 886*7d8d0e25Snbeams code << " const CeedInt Q1d = "<<Q1d<<";\n"; 887*7d8d0e25Snbeams 888*7d8d0e25Snbeams code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 889*7d8d0e25Snbeams code << " BackendData data;\n"; 890*7d8d0e25Snbeams code << " data.tidx = threadIdx.x;\n"; 891*7d8d0e25Snbeams code << " data.tidy = threadIdx.y;\n"; 
892*7d8d0e25Snbeams code << " data.tidz = threadIdx.z;\n"; 893*7d8d0e25Snbeams code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 894*7d8d0e25Snbeams code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 895*7d8d0e25Snbeams 896*7d8d0e25Snbeams code << "\n // -- Input field constants and basis data --\n"; 897*7d8d0e25Snbeams //Initialize constants, and matrices B and G 898*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 899*7d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 900*7d8d0e25Snbeams // Get elemsize, emode, ncomp 901*7d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 902*7d8d0e25Snbeams CeedChk(ierr); 903*7d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 904*7d8d0e25Snbeams CeedChk(ierr); 905*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 906*7d8d0e25Snbeams CeedChk(ierr); 907*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 908*7d8d0e25Snbeams CeedChk(ierr); 909*7d8d0e25Snbeams 910*7d8d0e25Snbeams // Set field constants 911*7d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT) { 912*7d8d0e25Snbeams ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 913*7d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 914*7d8d0e25Snbeams ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 915*7d8d0e25Snbeams code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 916*7d8d0e25Snbeams } else { 917*7d8d0e25Snbeams code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 918*7d8d0e25Snbeams } 919*7d8d0e25Snbeams code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 920*7d8d0e25Snbeams } 921*7d8d0e25Snbeams 922*7d8d0e25Snbeams // Load basis data 923*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 924*7d8d0e25Snbeams switch (emode) { 925*7d8d0e25Snbeams case CEED_EVAL_NONE: 926*7d8d0e25Snbeams break; 
927*7d8d0e25Snbeams case CEED_EVAL_INTERP: 928*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 929*7d8d0e25Snbeams data->B.in[i] = basis_data->d_interp1d; 930*7d8d0e25Snbeams code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931*7d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932*7d8d0e25Snbeams break; 933*7d8d0e25Snbeams case CEED_EVAL_GRAD: 934*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 935*7d8d0e25Snbeams data->B.in[i] = basis_data->d_interp1d; 936*7d8d0e25Snbeams code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 937*7d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 938*7d8d0e25Snbeams if (useCollograd) { 939*7d8d0e25Snbeams data->G.in[i] = basis_data->d_collograd1d; 940*7d8d0e25Snbeams code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 941*7d8d0e25Snbeams code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 942*7d8d0e25Snbeams } else { 943*7d8d0e25Snbeams data->G.in[i] = basis_data->d_grad1d; 944*7d8d0e25Snbeams code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 945*7d8d0e25Snbeams code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 946*7d8d0e25Snbeams } 947*7d8d0e25Snbeams break; 948*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: 949*7d8d0e25Snbeams break; // No action 950*7d8d0e25Snbeams case CEED_EVAL_DIV: 951*7d8d0e25Snbeams break; // TODO: Not implemented 952*7d8d0e25Snbeams case CEED_EVAL_CURL: 953*7d8d0e25Snbeams break; // TODO: Not implemented 954*7d8d0e25Snbeams } 955*7d8d0e25Snbeams } 956*7d8d0e25Snbeams 957*7d8d0e25Snbeams code << "\n // -- Output field constants and basis data --\n"; 958*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 959*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 960*7d8d0e25Snbeams // Get elemsize, emode, ncomp 961*7d8d0e25Snbeams ierr = 
CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 962*7d8d0e25Snbeams CeedChk(ierr); 963*7d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 964*7d8d0e25Snbeams CeedChk(ierr); 965*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 966*7d8d0e25Snbeams CeedChk(ierr); 967*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 968*7d8d0e25Snbeams CeedChk(ierr); 969*7d8d0e25Snbeams 970*7d8d0e25Snbeams // Set field constants 971*7d8d0e25Snbeams ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 972*7d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 973*7d8d0e25Snbeams ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 974*7d8d0e25Snbeams code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 975*7d8d0e25Snbeams } else { 976*7d8d0e25Snbeams code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 977*7d8d0e25Snbeams } 978*7d8d0e25Snbeams code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 979*7d8d0e25Snbeams 980*7d8d0e25Snbeams // Load basis data 981*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 982*7d8d0e25Snbeams switch (emode) { 983*7d8d0e25Snbeams case CEED_EVAL_NONE: 984*7d8d0e25Snbeams break; // No action 985*7d8d0e25Snbeams case CEED_EVAL_INTERP: 986*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 987*7d8d0e25Snbeams data->B.out[i] = basis_data->d_interp1d; 988*7d8d0e25Snbeams code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989*7d8d0e25Snbeams code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990*7d8d0e25Snbeams break; 991*7d8d0e25Snbeams case CEED_EVAL_GRAD: 992*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 993*7d8d0e25Snbeams data->B.out[i] = basis_data->d_interp1d; 994*7d8d0e25Snbeams code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 995*7d8d0e25Snbeams code << " 
loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 996*7d8d0e25Snbeams if (useCollograd) { 997*7d8d0e25Snbeams data->G.out[i] = basis_data->d_collograd1d; 998*7d8d0e25Snbeams code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 999*7d8d0e25Snbeams code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1000*7d8d0e25Snbeams } else { 1001*7d8d0e25Snbeams data->G.out[i] = basis_data->d_grad1d; 1002*7d8d0e25Snbeams code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1003*7d8d0e25Snbeams code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1004*7d8d0e25Snbeams } 1005*7d8d0e25Snbeams break; 1006*7d8d0e25Snbeams // LCOV_EXCL_START 1007*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 1008*7d8d0e25Snbeams Ceed ceed; 1009*7d8d0e25Snbeams ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1010*7d8d0e25Snbeams return CeedError(ceed, 1, 1011*7d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1012*7d8d0e25Snbeams break; // Should not occur 1013*7d8d0e25Snbeams } 1014*7d8d0e25Snbeams case CEED_EVAL_DIV: 1015*7d8d0e25Snbeams break; // TODO: Not implemented 1016*7d8d0e25Snbeams case CEED_EVAL_CURL: 1017*7d8d0e25Snbeams break; // TODO: Not implemented 1018*7d8d0e25Snbeams // LCOV_EXCL_STOP 1019*7d8d0e25Snbeams } 1020*7d8d0e25Snbeams } 1021*7d8d0e25Snbeams code << "\n // -- Element loop --\n"; 1022*7d8d0e25Snbeams code << " __syncthreads();\n"; 1023*7d8d0e25Snbeams code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1024*7d8d0e25Snbeams // Input basis apply if needed 1025*7d8d0e25Snbeams // Generate the correct eval mode code for each input 1026*7d8d0e25Snbeams code << " // -- Input field restrictions and basis actions --\n"; 1027*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 1028*7d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 1029*7d8d0e25Snbeams // Get elemsize, emode, ncomp 
1030*7d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1031*7d8d0e25Snbeams CeedChk(ierr); 1032*7d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1033*7d8d0e25Snbeams CeedChk(ierr); 1034*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1035*7d8d0e25Snbeams CeedChk(ierr); 1036*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1037*7d8d0e25Snbeams CeedChk(ierr); 1038*7d8d0e25Snbeams 1039*7d8d0e25Snbeams // Restriction 1040*7d8d0e25Snbeams if (emode != CEED_EVAL_WEIGHT && 1041*7d8d0e25Snbeams !((emode == CEED_EVAL_NONE) && useCollograd)) { 1042*7d8d0e25Snbeams code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1043*7d8d0e25Snbeams 1044*7d8d0e25Snbeams bool isStrided; 1045*7d8d0e25Snbeams ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1046*7d8d0e25Snbeams if (!isStrided) { 1047*7d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1048*7d8d0e25Snbeams CeedChk(ierr); 1049*7d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1050*7d8d0e25Snbeams CeedInt compstride; 1051*7d8d0e25Snbeams ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1052*7d8d0e25Snbeams code << " // CompStride: "<<compstride<<"\n"; 1053*7d8d0e25Snbeams ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1054*7d8d0e25Snbeams data->indices.in[i] = restr_data->d_ind; 1055*7d8d0e25Snbeams code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1056*7d8d0e25Snbeams } else { 1057*7d8d0e25Snbeams bool backendstrides; 1058*7d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1059*7d8d0e25Snbeams CeedChk(ierr); 1060*7d8d0e25Snbeams CeedInt nelem; 1061*7d8d0e25Snbeams ierr = 
CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1062*7d8d0e25Snbeams CeedChk(ierr); 1063*7d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1064*7d8d0e25Snbeams if (!backendstrides) { 1065*7d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1066*7d8d0e25Snbeams CeedChk(ierr); 1067*7d8d0e25Snbeams } 1068*7d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1069*7d8d0e25Snbeams code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1070*7d8d0e25Snbeams } 1071*7d8d0e25Snbeams } 1072*7d8d0e25Snbeams 1073*7d8d0e25Snbeams // Basis action 1074*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1075*7d8d0e25Snbeams switch (emode) { 1076*7d8d0e25Snbeams case CEED_EVAL_NONE: 1077*7d8d0e25Snbeams if (!useCollograd) { 1078*7d8d0e25Snbeams code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1079*7d8d0e25Snbeams } 1080*7d8d0e25Snbeams break; 1081*7d8d0e25Snbeams case CEED_EVAL_INTERP: 1082*7d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1083*7d8d0e25Snbeams code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1084*7d8d0e25Snbeams break; 1085*7d8d0e25Snbeams case CEED_EVAL_GRAD: 1086*7d8d0e25Snbeams if (useCollograd) { 1087*7d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1088*7d8d0e25Snbeams code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1089*7d8d0e25Snbeams } else { 1090*7d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1091*7d8d0e25Snbeams code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1092*7d8d0e25Snbeams } 1093*7d8d0e25Snbeams break; 1094*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: 
1095*7d8d0e25Snbeams code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1096*7d8d0e25Snbeams ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1097*7d8d0e25Snbeams ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1098*7d8d0e25Snbeams data->W = basis_data->d_qweight1d; 1099*7d8d0e25Snbeams code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1100*7d8d0e25Snbeams break; // No action 1101*7d8d0e25Snbeams case CEED_EVAL_DIV: 1102*7d8d0e25Snbeams break; // TODO: Not implemented 1103*7d8d0e25Snbeams case CEED_EVAL_CURL: 1104*7d8d0e25Snbeams break; // TODO: Not implemented 1105*7d8d0e25Snbeams } 1106*7d8d0e25Snbeams } 1107*7d8d0e25Snbeams 1108*7d8d0e25Snbeams // Q function 1109*7d8d0e25Snbeams code << "\n // -- Output field setup --\n"; 1110*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1111*7d8d0e25Snbeams code << "\n // ---- Output field "<<i<<" ----\n"; 1112*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1113*7d8d0e25Snbeams CeedChk(ierr); 1114*7d8d0e25Snbeams if (emode==CEED_EVAL_GRAD) 1115*7d8d0e25Snbeams { 1116*7d8d0e25Snbeams if (useCollograd) { 1117*7d8d0e25Snbeams //Accumulator for gradient slices 1118*7d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1119*7d8d0e25Snbeams code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1120*7d8d0e25Snbeams code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1121*7d8d0e25Snbeams code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1122*7d8d0e25Snbeams code << " }\n"; 1123*7d8d0e25Snbeams code << " }\n"; 1124*7d8d0e25Snbeams } else { 1125*7d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1126*7d8d0e25Snbeams } 1127*7d8d0e25Snbeams } 1128*7d8d0e25Snbeams if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1129*7d8d0e25Snbeams { 1130*7d8d0e25Snbeams code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1131*7d8d0e25Snbeams } 1132*7d8d0e25Snbeams } 1133*7d8d0e25Snbeams // We treat 
quadrature points per slice in 3d to save registers 1134*7d8d0e25Snbeams if (useCollograd) { 1135*7d8d0e25Snbeams code << "\n // Note: Collocated Gradient\n"; 1136*7d8d0e25Snbeams code << "#pragma unroll\n"; 1137*7d8d0e25Snbeams code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1138*7d8d0e25Snbeams code << " // -- Input fields --\n"; 1139*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 1140*7d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 1141*7d8d0e25Snbeams // Get elemsize, emode, ncomp 1142*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1143*7d8d0e25Snbeams CeedChk(ierr); 1144*7d8d0e25Snbeams // Basis action 1145*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1146*7d8d0e25Snbeams switch (emode) { 1147*7d8d0e25Snbeams case CEED_EVAL_NONE: 1148*7d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1149*7d8d0e25Snbeams 1150*7d8d0e25Snbeams bool isStrided; 1151*7d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 1152*7d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1153*7d8d0e25Snbeams ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1154*7d8d0e25Snbeams if (!isStrided) { 1155*7d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1156*7d8d0e25Snbeams CeedChk(ierr); 1157*7d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1158*7d8d0e25Snbeams CeedInt compstride; 1159*7d8d0e25Snbeams ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1160*7d8d0e25Snbeams code << " // CompStride: "<<compstride<<"\n"; 1161*7d8d0e25Snbeams ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1162*7d8d0e25Snbeams data->indices.in[i] = restr_data->d_ind; 1163*7d8d0e25Snbeams code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", 
elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1164*7d8d0e25Snbeams } else { 1165*7d8d0e25Snbeams bool backendstrides; 1166*7d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1167*7d8d0e25Snbeams CeedChk(ierr); 1168*7d8d0e25Snbeams CeedInt nelem; 1169*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1170*7d8d0e25Snbeams CeedChk(ierr); 1171*7d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1172*7d8d0e25Snbeams if (!backendstrides) { 1173*7d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1174*7d8d0e25Snbeams CeedChk(ierr); 1175*7d8d0e25Snbeams } 1176*7d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1177*7d8d0e25Snbeams code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1178*7d8d0e25Snbeams } 1179*7d8d0e25Snbeams break; 1180*7d8d0e25Snbeams case CEED_EVAL_INTERP: 1181*7d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1182*7d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1183*7d8d0e25Snbeams code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1184*7d8d0e25Snbeams code << " }\n"; 1185*7d8d0e25Snbeams break; 1186*7d8d0e25Snbeams case CEED_EVAL_GRAD: 1187*7d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1188*7d8d0e25Snbeams code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1189*7d8d0e25Snbeams break; 1190*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: 1191*7d8d0e25Snbeams code << " CeedScalar r_q"<<i<<"[1];\n"; 1192*7d8d0e25Snbeams code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1193*7d8d0e25Snbeams break; // No action 1194*7d8d0e25Snbeams case CEED_EVAL_DIV: 1195*7d8d0e25Snbeams break; // TODO: Not implemented 1196*7d8d0e25Snbeams case CEED_EVAL_CURL: 1197*7d8d0e25Snbeams break; // TODO: 
Not implemented 1198*7d8d0e25Snbeams } 1199*7d8d0e25Snbeams } 1200*7d8d0e25Snbeams code << "\n // -- Output fields --\n"; 1201*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1202*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 1203*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1204*7d8d0e25Snbeams CeedChk(ierr); 1205*7d8d0e25Snbeams // Basis action 1206*7d8d0e25Snbeams switch (emode) { 1207*7d8d0e25Snbeams case CEED_EVAL_NONE: 1208*7d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1209*7d8d0e25Snbeams break; // No action 1210*7d8d0e25Snbeams case CEED_EVAL_INTERP: 1211*7d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1212*7d8d0e25Snbeams break; 1213*7d8d0e25Snbeams case CEED_EVAL_GRAD: 1214*7d8d0e25Snbeams code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1215*7d8d0e25Snbeams break; 1216*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: 1217*7d8d0e25Snbeams break; // Should not occur 1218*7d8d0e25Snbeams case CEED_EVAL_DIV: 1219*7d8d0e25Snbeams break; // TODO: Not implemented 1220*7d8d0e25Snbeams case CEED_EVAL_CURL: 1221*7d8d0e25Snbeams break; // TODO: Not implemented 1222*7d8d0e25Snbeams } 1223*7d8d0e25Snbeams } 1224*7d8d0e25Snbeams } else { 1225*7d8d0e25Snbeams code << "\n // Note: No Collocated Gradient\n"; 1226*7d8d0e25Snbeams code << " // -- Input fields --\n"; 1227*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 1228*7d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 1229*7d8d0e25Snbeams code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1230*7d8d0e25Snbeams } 1231*7d8d0e25Snbeams code << " // -- Output fields --\n"; 1232*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1233*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 1234*7d8d0e25Snbeams code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1235*7d8d0e25Snbeams } 1236*7d8d0e25Snbeams } 1237*7d8d0e25Snbeams code << "\n // -- QFunction 
Inputs and outputs --\n"; 1238*7d8d0e25Snbeams code << " CeedScalar* in["<<numinputfields<<"];\n"; 1239*7d8d0e25Snbeams for (CeedInt i = 0; i < numinputfields; i++) { 1240*7d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 1241*7d8d0e25Snbeams code << " in["<<i<<"] = r_q"<<i<<";\n"; 1242*7d8d0e25Snbeams } 1243*7d8d0e25Snbeams code << " CeedScalar* out["<<numoutputfields<<"];\n"; 1244*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1245*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 1246*7d8d0e25Snbeams code << " out["<<i<<"] = r_qq"<<i<<";\n"; 1247*7d8d0e25Snbeams } 1248*7d8d0e25Snbeams code << "\n // -- Apply QFunction --\n"; 1249*7d8d0e25Snbeams code << " "<<qFunctionName<<"(ctx, "; 1250*7d8d0e25Snbeams if (dim != 3 || useCollograd) { 1251*7d8d0e25Snbeams code << "1"; 1252*7d8d0e25Snbeams } else { 1253*7d8d0e25Snbeams code << "Q1d"; 1254*7d8d0e25Snbeams } 1255*7d8d0e25Snbeams code << ", in, out);\n"; 1256*7d8d0e25Snbeams if (useCollograd) { 1257*7d8d0e25Snbeams code << "\n // Note: Collocated Gradient\n"; 1258*7d8d0e25Snbeams code << " // -- Output fields --\n"; 1259*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1260*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 1261*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1262*7d8d0e25Snbeams CeedChk(ierr); 1263*7d8d0e25Snbeams // Basis action 1264*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1265*7d8d0e25Snbeams switch (emode) { 1266*7d8d0e25Snbeams case CEED_EVAL_NONE: 1267*7d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1268*7d8d0e25Snbeams code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1269*7d8d0e25Snbeams code << " }\n"; 1270*7d8d0e25Snbeams break; // No action 1271*7d8d0e25Snbeams case CEED_EVAL_INTERP: 1272*7d8d0e25Snbeams code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1273*7d8d0e25Snbeams code << " r_tt"<<i<<"[q + 
j*Q1d] = r_qq"<<i<<"[j];\n"; 1274*7d8d0e25Snbeams code << " }\n"; 1275*7d8d0e25Snbeams break; 1276*7d8d0e25Snbeams case CEED_EVAL_GRAD: 1277*7d8d0e25Snbeams code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1278*7d8d0e25Snbeams break; 1279*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: 1280*7d8d0e25Snbeams break; // Should not occur 1281*7d8d0e25Snbeams case CEED_EVAL_DIV: 1282*7d8d0e25Snbeams break; // TODO: Not implemented 1283*7d8d0e25Snbeams case CEED_EVAL_CURL: 1284*7d8d0e25Snbeams break; // TODO: Not implemented 1285*7d8d0e25Snbeams } 1286*7d8d0e25Snbeams } 1287*7d8d0e25Snbeams code << " }\n"; 1288*7d8d0e25Snbeams } 1289*7d8d0e25Snbeams 1290*7d8d0e25Snbeams // Output basis apply if needed 1291*7d8d0e25Snbeams // Generate the correct eval mode code for each output 1292*7d8d0e25Snbeams code << "\n // -- Output field basis action and restrictions --\n"; 1293*7d8d0e25Snbeams for (CeedInt i = 0; i < numoutputfields; i++) { 1294*7d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 1295*7d8d0e25Snbeams // Get elemsize, emode, ncomp 1296*7d8d0e25Snbeams ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1297*7d8d0e25Snbeams CeedChk(ierr); 1298*7d8d0e25Snbeams ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1299*7d8d0e25Snbeams CeedChk(ierr); 1300*7d8d0e25Snbeams ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1301*7d8d0e25Snbeams CeedChk(ierr); 1302*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1303*7d8d0e25Snbeams CeedChk(ierr); 1304*7d8d0e25Snbeams // Basis action 1305*7d8d0e25Snbeams code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1306*7d8d0e25Snbeams switch (emode) { 1307*7d8d0e25Snbeams case CEED_EVAL_NONE: 1308*7d8d0e25Snbeams code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1309*7d8d0e25Snbeams break; // No action 1310*7d8d0e25Snbeams case CEED_EVAL_INTERP: 1311*7d8d0e25Snbeams code << " CeedScalar 
r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1312*7d8d0e25Snbeams code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1313*7d8d0e25Snbeams break; 1314*7d8d0e25Snbeams case CEED_EVAL_GRAD: 1315*7d8d0e25Snbeams code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1316*7d8d0e25Snbeams if (useCollograd) { 1317*7d8d0e25Snbeams code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1318*7d8d0e25Snbeams } else { 1319*7d8d0e25Snbeams code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1320*7d8d0e25Snbeams } 1321*7d8d0e25Snbeams break; 1322*7d8d0e25Snbeams // LCOV_EXCL_START 1323*7d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 1324*7d8d0e25Snbeams Ceed ceed; 1325*7d8d0e25Snbeams ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1326*7d8d0e25Snbeams return CeedError(ceed, 1, 1327*7d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1328*7d8d0e25Snbeams break; // Should not occur 1329*7d8d0e25Snbeams } 1330*7d8d0e25Snbeams case CEED_EVAL_DIV: 1331*7d8d0e25Snbeams break; // TODO: Not implemented 1332*7d8d0e25Snbeams case CEED_EVAL_CURL: 1333*7d8d0e25Snbeams break; // TODO: Not implemented 1334*7d8d0e25Snbeams // LCOV_EXCL_STOP 1335*7d8d0e25Snbeams } 1336*7d8d0e25Snbeams // Restriction 1337*7d8d0e25Snbeams bool isStrided; 1338*7d8d0e25Snbeams ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 1339*7d8d0e25Snbeams if (!isStrided) { 1340*7d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1341*7d8d0e25Snbeams CeedChk(ierr); 1342*7d8d0e25Snbeams code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1343*7d8d0e25Snbeams CeedInt compstride; 1344*7d8d0e25Snbeams ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1345*7d8d0e25Snbeams 
code << " // CompStride: "<<compstride<<"\n"; 1346*7d8d0e25Snbeams ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 1347*7d8d0e25Snbeams data->indices.out[i] = restr_data->d_ind; 1348*7d8d0e25Snbeams code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1349*7d8d0e25Snbeams } else { 1350*7d8d0e25Snbeams bool backendstrides; 1351*7d8d0e25Snbeams ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1352*7d8d0e25Snbeams CeedChk(ierr); 1353*7d8d0e25Snbeams CeedInt nelem; 1354*7d8d0e25Snbeams ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1355*7d8d0e25Snbeams CeedChk(ierr); 1356*7d8d0e25Snbeams CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1357*7d8d0e25Snbeams if (!backendstrides) { 1358*7d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1359*7d8d0e25Snbeams CeedChk(ierr); 1360*7d8d0e25Snbeams } 1361*7d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1362*7d8d0e25Snbeams code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1363*7d8d0e25Snbeams } 1364*7d8d0e25Snbeams } 1365*7d8d0e25Snbeams 1366*7d8d0e25Snbeams code << " }\n"; 1367*7d8d0e25Snbeams code << "}\n"; 1368*7d8d0e25Snbeams code << "// -----------------------------------------------------------------------------\n\n"; 1369*7d8d0e25Snbeams 1370*7d8d0e25Snbeams // View kernel for debugging 1371*7d8d0e25Snbeams CeedDebug(code.str().c_str()); 1372*7d8d0e25Snbeams 1373*7d8d0e25Snbeams ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 1, 1374*7d8d0e25Snbeams "T1d", CeedIntMax(Q1d, data->maxP1d)); 1375*7d8d0e25Snbeams CeedChk(ierr); 1376*7d8d0e25Snbeams ierr = CeedGetKernelHip(ceed, data->module, oper.c_str(), &data->op); 1377*7d8d0e25Snbeams 
CeedChk(ierr); 1378*7d8d0e25Snbeams 1379*7d8d0e25Snbeams ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1380*7d8d0e25Snbeams return 0; 1381*7d8d0e25Snbeams } 1382*7d8d0e25Snbeams //------------------------------------------------------------------------------ 1383