1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 177df94212SJeremy L Thompson 18241a4b83SYohann #include "ceed-cuda-gen.h" 19241a4b83SYohann #include <iostream> 20241a4b83SYohann #include <sstream> 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 26ab213215SJeremy L Thompson // Atomic add, for older CUDA 27ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 28241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 29241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 30241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 31241a4b83SYohann do { 32241a4b83SYohann assumed = old; 33241a4b83SYohann old = 34241a4b83SYohann atomicCAS(address_as_ull, assumed, 35241a4b83SYohann __double_as_longlong(val + 36241a4b83SYohann __longlong_as_double(assumed))); 37241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 38241a4b83SYohann // (since NaN != NaN) 39241a4b83SYohann } while (assumed != old); 40241a4b83SYohann return __longlong_as_double(old); 41241a4b83SYohann } 42f1a13f77SYohann Dudouit ); 43f1a13f77SYohann Dudouit 44f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 45f1a13f77SYohann Dudouit 46ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 47ab213215SJeremy L Thompson // Typedefs 48ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 49f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 50f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 51f1a13f77SYohann Dudouit 52f1a13f77SYohann Dudouit typedef struct { 53f1a13f77SYohann Dudouit CeedInt tidx; 54f1a13f77SYohann Dudouit CeedInt tidy; 55f1a13f77SYohann Dudouit CeedInt tidz; 56f1a13f77SYohann Dudouit CeedInt tid; 57f1a13f77SYohann Dudouit CeedScalar* slice; 58f1a13f77SYohann Dudouit } BackendData; 59241a4b83SYohann 60ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 61ab213215SJeremy L Thompson // Load matrices for basis actions 62ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 63241a4b83SYohann template <int P, int Q> 64241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 65ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 66241a4b83SYohann B[i] = d_B[i]; 67241a4b83SYohann } 68241a4b83SYohann 69ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 70241a4b83SYohann // 1D 71ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 72ab213215SJeremy L Thompson 73ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 74ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 75ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 775c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78ab213215SJeremy L Thompson if (data.tidx < P1d) { 794d537eeaSYohann const CeedInt node = data.tidx; 80ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 81ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 825c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85920dcdc4Sjeremylt 86ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 87ab213215SJeremy L Thompson // L-vector -> E-vector, strided 88ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 89d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 90d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 91ab213215SJeremy L Thompson if (data.tidx < P1d) { 92ccedf6b0Sjeremylt const CeedInt node = data.tidx; 93d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 94ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 95d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 96ccedf6b0Sjeremylt } 97ccedf6b0Sjeremylt } 98241a4b83SYohann 99ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 100ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 101ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1025c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1035c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 104ab213215SJeremy L Thompson if (data.tidx < P1d) { 1054d537eeaSYohann const CeedInt node = data.tidx; 106ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 107ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1085c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 109241a4b83SYohann } 110241a4b83SYohann } 111241a4b83SYohann 112ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 113ab213215SJeremy L Thompson // E-vector -> L-vector, strided 114ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 115d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 116d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 117ab213215SJeremy L Thompson if (data.tidx < P1d) { 118ccedf6b0Sjeremylt const CeedInt node = data.tidx; 119d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 120ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 121d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 122ccedf6b0Sjeremylt } 123ccedf6b0Sjeremylt } 124ccedf6b0Sjeremylt 125ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 126ab213215SJeremy L Thompson // 1D tensor contraction x 127ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 128241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 129ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130241a4b83SYohann data.slice[data.tidx] = *U; 131241a4b83SYohann __syncthreads(); 132241a4b83SYohann *V = 0.0; 133*18d499f1SYohann if (data.tidx < Q1d) 134ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 135ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 136241a4b83SYohann __syncthreads(); 137241a4b83SYohann } 138241a4b83SYohann 139ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 140ab213215SJeremy L Thompson // 1D transpose tensor contraction x 141ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 142241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 143ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 144241a4b83SYohann data.slice[data.tidx] = *U; 145241a4b83SYohann __syncthreads(); 146241a4b83SYohann *V = 0.0; 147*18d499f1SYohann if (data.tidx < P1d) 148ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 149ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 150241a4b83SYohann __syncthreads(); 151241a4b83SYohann } 152241a4b83SYohann 153ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 154ab213215SJeremy L Thompson // 1D interpolate to quadrature points 155ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 156241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 157ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 158ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 159241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 160241a4b83SYohann } 161241a4b83SYohann 162ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 163ab213215SJeremy L Thompson // 1D interpolate transpose 164ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 165241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 166ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 167ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 168241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 169241a4b83SYohann } 170241a4b83SYohann 171ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 172ab213215SJeremy L Thompson // 1D derivatives at quadrature points 173ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 174241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 175ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 176ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 177241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 178241a4b83SYohann } 179241a4b83SYohann 180ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 181ab213215SJeremy L Thompson // 1D derivatives transpose 182ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 183241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 184ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 185ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 186241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 187241a4b83SYohann } 188241a4b83SYohann 189ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 190241a4b83SYohann // 2D 191ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 192ab213215SJeremy L Thompson 193ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 194ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 195ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1965c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1975c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 198ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 1994d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 200ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 201ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2025c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 203241a4b83SYohann } 204241a4b83SYohann } 205920dcdc4Sjeremylt 206ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 207ab213215SJeremy L Thompson // L-vector -> E-vector, strided 208ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 209d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 210d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 211ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 212ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 213d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 214ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 215d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 216ccedf6b0Sjeremylt } 217ccedf6b0Sjeremylt } 218241a4b83SYohann 219ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 220ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 221ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2225c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 2235c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 224ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2254d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 226ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 227ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2285c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 229241a4b83SYohann } 230241a4b83SYohann } 231241a4b83SYohann 232ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 233ab213215SJeremy L Thompson // E-vector -> L-vector, strided 234ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 235d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 236d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 237ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 238ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 239d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 240ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 241d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 242ccedf6b0Sjeremylt } 243ccedf6b0Sjeremylt } 244ccedf6b0Sjeremylt 245ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 246ab213215SJeremy L Thompson // 2D tensor contraction x 247ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 248241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 249ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 250*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 251241a4b83SYohann __syncthreads(); 252241a4b83SYohann *V = 0.0; 253*18d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 254ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 255*18d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 256241a4b83SYohann __syncthreads(); 257241a4b83SYohann } 258241a4b83SYohann 259ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 260ab213215SJeremy L Thompson // 2D tensor contract y 261ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 262241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 263ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 264*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 265241a4b83SYohann __syncthreads(); 266241a4b83SYohann *V = 0.0; 267*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 268ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 269*18d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 270241a4b83SYohann __syncthreads(); 271241a4b83SYohann } 272241a4b83SYohann 273ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 274ab213215SJeremy L Thompson // 2D transpose tensor contract y 275ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 276241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 277ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 278*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 279241a4b83SYohann __syncthreads(); 280241a4b83SYohann *V = 0.0; 281*18d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 282ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 283*18d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 284241a4b83SYohann __syncthreads(); 285241a4b83SYohann } 286241a4b83SYohann 287ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 288ab213215SJeremy L Thompson // 2D transpose tensor contract x 289ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 290241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 291ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 292*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 293241a4b83SYohann __syncthreads(); 294241a4b83SYohann *V = 0.0; 295*18d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 296ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 297*18d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 298241a4b83SYohann __syncthreads(); 299241a4b83SYohann } 300241a4b83SYohann 301ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 302ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 303ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 304241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 305ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 306*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 307241a4b83SYohann __syncthreads(); 308*18d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 309ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 310*18d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 311241a4b83SYohann __syncthreads(); 312241a4b83SYohann } 313241a4b83SYohann 314ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 315ab213215SJeremy L Thompson // 2D interpolate to quadrature points 316ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 317241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 318ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 319241a4b83SYohann CeedScalar r_t[1]; 320ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 321241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 322241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 323241a4b83SYohann } 324241a4b83SYohann } 325241a4b83SYohann 326ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 327ab213215SJeremy L Thompson // 2D interpolate transpose 328ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 329241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 330ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 331241a4b83SYohann CeedScalar r_t[1]; 332ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 333241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 334241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 335241a4b83SYohann } 336241a4b83SYohann } 337241a4b83SYohann 338ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 339ab213215SJeremy L Thompson // 2D derivatives at quadrature points 340ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 341241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 342ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 343241a4b83SYohann CeedScalar r_t[1]; 344ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 345241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 346241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 347241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 348241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 349241a4b83SYohann } 350241a4b83SYohann } 351241a4b83SYohann 352ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 353ab213215SJeremy L Thompson // 2D derivatives transpose 354ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 355241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 356ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 357241a4b83SYohann CeedScalar r_t[1]; 358ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 359241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 360241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 361241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 362241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 363241a4b83SYohann } 364241a4b83SYohann } 365241a4b83SYohann 366ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 367241a4b83SYohann // 3D 368ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 369ab213215SJeremy L Thompson 370ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 371ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 372ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3735c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 3745c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 375ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 376241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3774d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 378ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 379ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3805c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 381241a4b83SYohann } 382241a4b83SYohann } 383920dcdc4Sjeremylt 384ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 385ab213215SJeremy L Thompson // L-vector -> E-vector, strided 386ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 387d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 388d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 389ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 390ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 391ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 392d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 393ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 394d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 395ccedf6b0Sjeremylt } 396ccedf6b0Sjeremylt } 397241a4b83SYohann 398ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 399ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 400ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 4025c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 403*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 404ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 405920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 406ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4075c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 408ac421f39SYohann } 409*18d499f1SYohann } 410ac421f39SYohann 411ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 412ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 413ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 414d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 41525dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 416*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 417920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 418d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 419ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 420d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 421920dcdc4Sjeremylt } 422*18d499f1SYohann } 423920dcdc4Sjeremylt 424ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 425ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 426ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4275c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 4285c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 429ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 430241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4314d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 432ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 433ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4345c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 435241a4b83SYohann } 436241a4b83SYohann } 437241a4b83SYohann 438ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 439ab213215SJeremy L Thompson // E-vector -> L-vector, strided 440ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 441d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4422f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 443ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 444ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 445ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 446d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 447ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 448d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 449ccedf6b0Sjeremylt } 450ccedf6b0Sjeremylt } 451ccedf6b0Sjeremylt 452ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 453ab213215SJeremy L Thompson // 3D tensor contract x 454ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 455241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 456ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 457ac421f39SYohann CeedScalar r_B[P1d]; 458ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 459ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 460ab213215SJeremy L Thompson 461ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 462*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 463241a4b83SYohann __syncthreads(); 464241a4b83SYohann V[k] = 0.0; 465*18d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 466ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 467*18d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 468241a4b83SYohann __syncthreads(); 469241a4b83SYohann } 470241a4b83SYohann } 471241a4b83SYohann 472ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 473ab213215SJeremy L Thompson // 3D tensor contract y 474ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 475241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 476ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 477ac421f39SYohann CeedScalar r_B[P1d]; 478ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 479ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 480ab213215SJeremy L Thompson 481ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 482*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 483241a4b83SYohann __syncthreads(); 484241a4b83SYohann V[k] = 0.0; 485*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 486ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 487*18d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 488241a4b83SYohann __syncthreads(); 489241a4b83SYohann } 490241a4b83SYohann } 491241a4b83SYohann 492ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 493ab213215SJeremy L Thompson // 3D tensor contract z 494ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 495241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 496ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 497ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 498241a4b83SYohann V[k] = 0.0; 499*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 500ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 501ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 502241a4b83SYohann } 503241a4b83SYohann } 504241a4b83SYohann 505ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 506ab213215SJeremy L Thompson // 3D transpose tensor contract z 507ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 508241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 509ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 510*18d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 511241a4b83SYohann V[k] = 0.0; 512*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 513ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 514ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 515241a4b83SYohann } 516241a4b83SYohann } 517241a4b83SYohann 518ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 519ab213215SJeremy L Thompson // 3D transpose tensor contract y 520ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 521241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 522ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 523ac421f39SYohann CeedScalar r_B[Q1d]; 524ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 525ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 526ab213215SJeremy L Thompson 527ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 528*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 529241a4b83SYohann __syncthreads(); 530241a4b83SYohann V[k] = 0.0; 531*18d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 532ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 533*18d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 534ac421f39SYohann __syncthreads(); 535ac421f39SYohann } 536ac421f39SYohann } 537ac421f39SYohann 538ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 539ab213215SJeremy L Thompson // 3D transpose tensor contract add y 540ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 541ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 542ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 543ac421f39SYohann CeedScalar r_B[Q1d]; 544ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 545ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 546ab213215SJeremy L Thompson 547ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 548*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 549ac421f39SYohann __syncthreads(); 550*18d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 551ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 552*18d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 553241a4b83SYohann __syncthreads(); 554241a4b83SYohann } 555241a4b83SYohann } 556241a4b83SYohann 557ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 558ab213215SJeremy L Thompson // 3D transpose tensor contract x 559ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 560241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 561ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 562ac421f39SYohann CeedScalar r_B[Q1d]; 563ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 564ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 565ab213215SJeremy L Thompson 566ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 567*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 568241a4b83SYohann __syncthreads(); 569241a4b83SYohann V[k] = 0.0; 570*18d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 571ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 572*18d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 573241a4b83SYohann __syncthreads(); 574241a4b83SYohann } 575241a4b83SYohann } 576241a4b83SYohann 577ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 578ab213215SJeremy L Thompson // 3D transpose tensor contract add x 579ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 580241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 581ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 582ac421f39SYohann CeedScalar r_B[Q1d]; 583ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 584ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 585ab213215SJeremy L Thompson 586ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 587*18d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 588241a4b83SYohann __syncthreads(); 589*18d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 590ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 591*18d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 592241a4b83SYohann __syncthreads(); 593241a4b83SYohann } 594241a4b83SYohann } 595241a4b83SYohann 596ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 597ab213215SJeremy L Thompson // 3D interpolate to quadrature points 598ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 599241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 600ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 601*18d499f1SYohann CeedScalar r_t1[T1d]; 602*18d499f1SYohann CeedScalar r_t2[T1d]; 603ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 604241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 605241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 606241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 607241a4b83SYohann } 608241a4b83SYohann } 609241a4b83SYohann 610ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 611ab213215SJeremy L Thompson // 3D interpolate transpose 612ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 613241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 614ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 615*18d499f1SYohann CeedScalar r_t1[T1d]; 616*18d499f1SYohann CeedScalar r_t2[T1d]; 617ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 618241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 619241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 620241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 621241a4b83SYohann } 622241a4b83SYohann } 623241a4b83SYohann 624ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 625ab213215SJeremy L Thompson // 3D derivatives at quadrature points 626ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 627241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 628ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 629*18d499f1SYohann CeedScalar r_t1[T1d]; 630*18d499f1SYohann CeedScalar r_t2[T1d]; 631ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 632241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 633241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 634241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 635241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 636241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 637241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 638241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 639241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 640241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 641241a4b83SYohann } 642241a4b83SYohann } 643241a4b83SYohann 644ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 645ab213215SJeremy L Thompson // 3D derivatives transpose 646ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 647241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 648ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 649*18d499f1SYohann CeedScalar r_t1[T1d]; 650*18d499f1SYohann CeedScalar r_t2[T1d]; 651ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 652241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 653241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 654241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 655241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 656241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 657241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 658241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 659241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 660241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 661241a4b83SYohann } 662241a4b83SYohann } 663241a4b83SYohann 664ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 665ab213215SJeremy L Thompson // 3D collocated derivatives computation 666ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 667ac421f39SYohann template <int NCOMP, int Q1d> 668ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 669*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 670ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 671*18d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 672ac421f39SYohann __syncthreads(); 673ac421f39SYohann // X derivative 674ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 675ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 676*18d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 677ac421f39SYohann // Y derivative 678ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 679ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 680*18d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 681ac421f39SYohann // Z derivative 682ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 683ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 684ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 685ac421f39SYohann __syncthreads(); 686ac421f39SYohann } 687ac421f39SYohann } 688*18d499f1SYohann } 689ac421f39SYohann 690ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 691ab213215SJeremy L Thompson // 3D collocated derivatives transpose 692ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 693ac421f39SYohann template <int NCOMP, int Q1d> 694ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 695*18d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 696ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 697ac421f39SYohann // X derivative 698*18d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 699ac421f39SYohann __syncthreads(); 700ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 701*18d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 702ac421f39SYohann __syncthreads(); 703ac421f39SYohann // Y derivative 704*18d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 705ac421f39SYohann __syncthreads(); 706ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 707*18d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 708ac421f39SYohann __syncthreads(); 709ac421f39SYohann // Z derivative 710ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 711ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 712ac421f39SYohann } 713ac421f39SYohann } 714*18d499f1SYohann } 715ac421f39SYohann 716ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 717ab213215SJeremy L Thompson // 1D quadrature weights 718ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 719241a4b83SYohann template <int Q1d> 720241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 721*18d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 722241a4b83SYohann } 723241a4b83SYohann 724ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 725ab213215SJeremy L Thompson // 2D quadrature weights 726ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 727241a4b83SYohann template <int Q1d> 728241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 729*18d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 730*18d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 731241a4b83SYohann } 732241a4b83SYohann 733ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 734ab213215SJeremy L Thompson // 3D quadrature weights 735ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 736241a4b83SYohann template <int Q1d> 737241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 738*18d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 739*18d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 740ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 741*18d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 742241a4b83SYohann } 743241a4b83SYohann 744241a4b83SYohann ); 745ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 746ab213215SJeremy L Thompson // Build singe operator kernel 747ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 748241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 749241a4b83SYohann 750241a4b83SYohann using std::ostringstream; 751241a4b83SYohann using std::string; 752241a4b83SYohann int ierr; 753241a4b83SYohann bool setupdone; 754fd364f38SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 755241a4b83SYohann if (setupdone) return 0; 756241a4b83SYohann Ceed ceed; 757241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 758241a4b83SYohann CeedOperator_Cuda_gen *data; 7596c845298Sjeremylt ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 760241a4b83SYohann CeedQFunction qf; 761241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 762241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 7636c845298Sjeremylt ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 7641da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 7655c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 766241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 767241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 768241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 769241a4b83SYohann CeedChk(ierr); 770241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 771241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 772241a4b83SYohann CeedChk(ierr); 773241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 774241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 775241a4b83SYohann CeedChk(ierr); 776241a4b83SYohann CeedEvalMode emode; 777241a4b83SYohann CeedBasis basis; 778241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 779241a4b83SYohann CeedElemRestriction Erestrict; 780241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 781241a4b83SYohann 782241a4b83SYohann ostringstream code; 783241a4b83SYohann string devFunctions(deviceFunctions); 784241a4b83SYohann 785f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 786f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 787f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 7886c845298Sjeremylt ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 789f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 790f1a13f77SYohann Dudouit if (prop.major<6){ 791f1a13f77SYohann Dudouit code << atomicAdd; 792f1a13f77SYohann Dudouit } 793f1a13f77SYohann Dudouit 794241a4b83SYohann code << devFunctions; 795241a4b83SYohann 796241a4b83SYohann string qFunction(qf_data->qFunctionSource); 797c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 798c3c443faSYohann Dudouit string oper; 79914cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 8004d537eeaSYohann 8014d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 802ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 8031da99368SJeremy L Thompson 8041da99368SJeremy L Thompson // Find dim and Q1d 80564d3f0c0Sjeremylt bool useCollograd = true; 806*18d499f1SYohann data->maxP1d = 0; 8071da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 8081da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 8091da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8106c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 8110f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 8120f54b25eSjeremylt CeedChk(ierr); 8131da99368SJeremy L Thompson 8141da99368SJeremy L Thompson // Check for collocated gradient 815*18d499f1SYohann useCollograd = useCollograd && basis_data->d_collograd1d; 8161da99368SJeremy L Thompson 8171da99368SJeremy L Thompson // Collect dim and Q1d 8181da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8191da99368SJeremy L Thompson bool isTensor; 820fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8211da99368SJeremy L Thompson if (isTensor) { 8221da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 823*18d499f1SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 824*18d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8251da99368SJeremy L Thompson } else { 826e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8271da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 828e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8291da99368SJeremy L Thompson } 8301da99368SJeremy L Thompson } 8311da99368SJeremy L Thompson } 8321da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8331da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8341da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 8351da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 8360f54b25eSjeremylt 8371da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8386c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 8390f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 8400f54b25eSjeremylt CeedChk(ierr); 8410f54b25eSjeremylt 8421da99368SJeremy L Thompson // Collect dim and Q1d 8431da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8441da99368SJeremy L Thompson bool isTensor; 845fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8461da99368SJeremy L Thompson if (isTensor) { 8471da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8481da99368SJeremy L Thompson } else { 849e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8501da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 851e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8521da99368SJeremy L Thompson } 8530f54b25eSjeremylt 8540f54b25eSjeremylt // Check for collocated gradient 85564d3f0c0Sjeremylt useCollograd = useCollograd && basis_data->d_collograd1d; 8561da99368SJeremy L Thompson } 8571da99368SJeremy L Thompson } 8581da99368SJeremy L Thompson data->dim = dim; 8591da99368SJeremy L Thompson data->Q1d = Q1d; 8601da99368SJeremy L Thompson 8611da99368SJeremy L Thompson // Define CEED_Q_VLA 86264d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8631da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8641da99368SJeremy L Thompson } else { 8651da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8661da99368SJeremy L Thompson } 8671da99368SJeremy L Thompson 868241a4b83SYohann code << qFunction; 869241a4b83SYohann 870241a4b83SYohann // Setup 871818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 872c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 873241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 874241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 875241a4b83SYohann CeedChk(ierr); 8761da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 877241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 878241a4b83SYohann } 879241a4b83SYohann } 880241a4b83SYohann 881241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 882241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 883241a4b83SYohann } 8841da99368SJeremy L Thompson 885241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 886241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 8871da99368SJeremy L Thompson 888241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 889241a4b83SYohann code << " BackendData data;\n"; 890241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 891241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 892241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 893241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 894*18d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 895920dcdc4Sjeremylt 896818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 897ac421f39SYohann //Initialize constants, and matrices B and G 898241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 899818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 900241a4b83SYohann // Get elemsize, emode, ncomp 901241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 902241a4b83SYohann CeedChk(ierr); 903241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 904241a4b83SYohann CeedChk(ierr); 905241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 906241a4b83SYohann CeedChk(ierr); 9074d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 908241a4b83SYohann CeedChk(ierr); 909920dcdc4Sjeremylt 910920dcdc4Sjeremylt // Set field constants 911920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 912920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 913920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 914920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 915920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 916920dcdc4Sjeremylt } else { 917920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 918920dcdc4Sjeremylt } 919920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 920920dcdc4Sjeremylt } 921920dcdc4Sjeremylt 922920dcdc4Sjeremylt // Load basis data 923920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 924241a4b83SYohann switch (emode) { 925241a4b83SYohann case CEED_EVAL_NONE: 926241a4b83SYohann break; 927241a4b83SYohann case CEED_EVAL_INTERP: 9286c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 929241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 930241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932241a4b83SYohann break; 933241a4b83SYohann case CEED_EVAL_GRAD: 9346c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 935241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 936241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 937241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 93864d3f0c0Sjeremylt if (useCollograd) { 939ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 940ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 941ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 942ac421f39SYohann } else { 943ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 944ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 945241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 946ac421f39SYohann } 947241a4b83SYohann break; 948241a4b83SYohann case CEED_EVAL_WEIGHT: 949241a4b83SYohann break; // No action 950241a4b83SYohann case CEED_EVAL_DIV: 951241a4b83SYohann break; // TODO: Not implemented 952241a4b83SYohann case CEED_EVAL_CURL: 953241a4b83SYohann break; // TODO: Not implemented 954241a4b83SYohann } 955241a4b83SYohann } 956920dcdc4Sjeremylt 957818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 958241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 959818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 960241a4b83SYohann // Get elemsize, emode, ncomp 961241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 962241a4b83SYohann CeedChk(ierr); 963241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 964241a4b83SYohann CeedChk(ierr); 965241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 966241a4b83SYohann CeedChk(ierr); 9674d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 968241a4b83SYohann CeedChk(ierr); 969920dcdc4Sjeremylt 970920dcdc4Sjeremylt // Set field constants 971241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 972920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 973241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 974241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 975920dcdc4Sjeremylt } else { 976920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 977920dcdc4Sjeremylt } 978920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 979920dcdc4Sjeremylt 980920dcdc4Sjeremylt // Load basis data 981920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 982920dcdc4Sjeremylt switch (emode) { 983920dcdc4Sjeremylt case CEED_EVAL_NONE: 984920dcdc4Sjeremylt break; // No action 985920dcdc4Sjeremylt case CEED_EVAL_INTERP: 9866c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 987241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 988241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990241a4b83SYohann break; 991241a4b83SYohann case CEED_EVAL_GRAD: 9926c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 993241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 994241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 995241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 99664d3f0c0Sjeremylt if (useCollograd) { 997ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 998ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 999ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1000ac421f39SYohann } else { 1001ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 1002ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1003241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1004ac421f39SYohann } 1005241a4b83SYohann break; 1006e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1007241a4b83SYohann case CEED_EVAL_WEIGHT: { 1008241a4b83SYohann Ceed ceed; 1009241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1010241a4b83SYohann return CeedError(ceed, 1, 1011241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1012241a4b83SYohann break; // Should not occur 1013241a4b83SYohann } 1014241a4b83SYohann case CEED_EVAL_DIV: 1015241a4b83SYohann break; // TODO: Not implemented 1016241a4b83SYohann case CEED_EVAL_CURL: 1017241a4b83SYohann break; // TODO: Not implemented 1018e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1019241a4b83SYohann } 1020241a4b83SYohann } 1021818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1022ac421f39SYohann code << " __syncthreads();\n"; 1023241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1024241a4b83SYohann // Input basis apply if needed 1025ac421f39SYohann // Generate the correct eval mode code for each input 1026818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1027241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1028818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1029241a4b83SYohann // Get elemsize, emode, ncomp 1030241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1031241a4b83SYohann CeedChk(ierr); 1032241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1033241a4b83SYohann CeedChk(ierr); 1034241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1035241a4b83SYohann CeedChk(ierr); 10364d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1037241a4b83SYohann CeedChk(ierr); 1038920dcdc4Sjeremylt 1039920dcdc4Sjeremylt // Restriction 1040920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 104164d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1042241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10439a2291e3SJeremy L Thompson 10449a2291e3SJeremy L Thompson bool isStrided; 1045fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 10469a2291e3SJeremy L Thompson if (!isStrided) { 10475c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 10485c7b696cSjeremylt CeedChk(ierr); 10495c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10505c7b696cSjeremylt CeedInt compstride; 10515c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 10525c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 10536c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 10549a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10555c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1056ccedf6b0Sjeremylt } else { 1057920dcdc4Sjeremylt bool backendstrides; 1058fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1059920dcdc4Sjeremylt CeedChk(ierr); 106013585805SJeremy L Thompson CeedInt nelem; 106113585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 106213585805SJeremy L Thompson CeedChk(ierr); 106313585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1064920dcdc4Sjeremylt if (!backendstrides) { 1065920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1066920dcdc4Sjeremylt CeedChk(ierr); 1067ccedf6b0Sjeremylt } 1068920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1069d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1070920dcdc4Sjeremylt } 1071920dcdc4Sjeremylt } 1072920dcdc4Sjeremylt 1073920dcdc4Sjeremylt // Basis action 1074920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1075920dcdc4Sjeremylt switch (emode) { 1076920dcdc4Sjeremylt case CEED_EVAL_NONE: 107764d3f0c0Sjeremylt if (!useCollograd) { 1078920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1079920dcdc4Sjeremylt } 1080920dcdc4Sjeremylt break; 1081920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1082241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1083241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1084241a4b83SYohann break; 1085241a4b83SYohann case CEED_EVAL_GRAD: 108664d3f0c0Sjeremylt if (useCollograd) { 1087ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1088ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1089ac421f39SYohann } else { 1090241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1091241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1092ac421f39SYohann } 1093241a4b83SYohann break; 1094241a4b83SYohann case CEED_EVAL_WEIGHT: 1095241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1096241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 10976c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1098241a4b83SYohann data->W = basis_data->d_qweight1d; 1099241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1100241a4b83SYohann break; // No action 1101241a4b83SYohann case CEED_EVAL_DIV: 1102241a4b83SYohann break; // TODO: Not implemented 1103241a4b83SYohann case CEED_EVAL_CURL: 1104241a4b83SYohann break; // TODO: Not implemented 1105241a4b83SYohann } 1106241a4b83SYohann } 1107ac421f39SYohann 1108241a4b83SYohann // Q function 1109818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1110241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1111818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1112241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1113241a4b83SYohann CeedChk(ierr); 1114241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1115241a4b83SYohann { 111664d3f0c0Sjeremylt if (useCollograd) { 1117ac421f39SYohann //Accumulator for gradient slices 1118ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1119ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1120ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1121ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1122ac421f39SYohann code << " }\n"; 1123ac421f39SYohann code << " }\n"; 1124ac421f39SYohann } else { 1125241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1126241a4b83SYohann } 1127ac421f39SYohann } 1128241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1129241a4b83SYohann { 1130241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1131241a4b83SYohann } 1132241a4b83SYohann } 1133ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 113464d3f0c0Sjeremylt if (useCollograd) { 1135920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1136ac421f39SYohann code << "#pragma unroll\n"; 1137ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1138818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1139ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1140818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1141ac421f39SYohann // Get elemsize, emode, ncomp 1142ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1143ac421f39SYohann CeedChk(ierr); 1144ac421f39SYohann // Basis action 1145920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1146ac421f39SYohann switch (emode) { 1147ac421f39SYohann case CEED_EVAL_NONE: 1148ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11499a2291e3SJeremy L Thompson 11509a2291e3SJeremy L Thompson bool isStrided; 1151792ff326SYohann Dudouit ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 115213585805SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1153fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 11549a2291e3SJeremy L Thompson if (!isStrided) { 11555c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 11565c7b696cSjeremylt CeedChk(ierr); 11575c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11585c7b696cSjeremylt CeedInt compstride; 11595c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 11605c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 11616c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 11629a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11635c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1164920dcdc4Sjeremylt } else { 1165920dcdc4Sjeremylt bool backendstrides; 1166fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1167920dcdc4Sjeremylt CeedChk(ierr); 116813585805SJeremy L Thompson CeedInt nelem; 116913585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 117013585805SJeremy L Thompson CeedChk(ierr); 117113585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1172920dcdc4Sjeremylt if (!backendstrides) { 1173920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1174920dcdc4Sjeremylt CeedChk(ierr); 1175920dcdc4Sjeremylt } 1176920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1177d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1178920dcdc4Sjeremylt } 1179ac421f39SYohann break; 1180ac421f39SYohann case CEED_EVAL_INTERP: 1181ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1182ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1183ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1184ac421f39SYohann code << " }\n"; 1185ac421f39SYohann break; 1186ac421f39SYohann case CEED_EVAL_GRAD: 1187ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1188ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1189ac421f39SYohann break; 1190ac421f39SYohann case CEED_EVAL_WEIGHT: 1191ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1192ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1193ac421f39SYohann break; // No action 1194ac421f39SYohann case CEED_EVAL_DIV: 1195ac421f39SYohann break; // TODO: Not implemented 1196ac421f39SYohann case CEED_EVAL_CURL: 1197ac421f39SYohann break; // TODO: Not implemented 1198ac421f39SYohann } 1199ac421f39SYohann } 1200818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1201ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1202818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1203ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1204ac421f39SYohann CeedChk(ierr); 1205ac421f39SYohann // Basis action 1206ac421f39SYohann switch (emode) { 1207ac421f39SYohann case CEED_EVAL_NONE: 1208ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1209ac421f39SYohann break; // No action 1210ac421f39SYohann case CEED_EVAL_INTERP: 1211ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1212ac421f39SYohann break; 1213ac421f39SYohann case CEED_EVAL_GRAD: 1214ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1215ac421f39SYohann break; 1216ac421f39SYohann case CEED_EVAL_WEIGHT: 1217ac421f39SYohann break; // Should not occur 1218ac421f39SYohann case CEED_EVAL_DIV: 1219ac421f39SYohann break; // TODO: Not implemented 1220ac421f39SYohann case CEED_EVAL_CURL: 1221ac421f39SYohann break; // TODO: Not implemented 1222ac421f39SYohann } 1223ac421f39SYohann } 1224ac421f39SYohann } else { 1225920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1226818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1227ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1228818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1229ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1230ac421f39SYohann } 1231818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1232ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1233818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1234ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1235ac421f39SYohann } 1236ac421f39SYohann } 1237818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12384d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12394d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1240818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1241ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12424d537eeaSYohann } 12434d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12444d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1245818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1246ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12474d537eeaSYohann } 1248818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1249ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 125064d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1251ac421f39SYohann code << "1"; 1252ac421f39SYohann } else { 1253ac421f39SYohann code << "Q1d"; 1254ac421f39SYohann } 1255ac421f39SYohann code << ", in, out);\n"; 125664d3f0c0Sjeremylt if (useCollograd) { 1257920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1258818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1259ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1260818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1261ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1262ac421f39SYohann CeedChk(ierr); 1263ac421f39SYohann // Basis action 1264920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1265ac421f39SYohann switch (emode) { 1266ac421f39SYohann case CEED_EVAL_NONE: 1267ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1268ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1269ac421f39SYohann code << " }\n"; 1270ac421f39SYohann break; // No action 1271ac421f39SYohann case CEED_EVAL_INTERP: 1272ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1273ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1274ac421f39SYohann code << " }\n"; 1275ac421f39SYohann break; 1276ac421f39SYohann case CEED_EVAL_GRAD: 1277ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1278ac421f39SYohann break; 1279ac421f39SYohann case CEED_EVAL_WEIGHT: 1280ac421f39SYohann break; // Should not occur 1281ac421f39SYohann case CEED_EVAL_DIV: 1282ac421f39SYohann break; // TODO: Not implemented 1283ac421f39SYohann case CEED_EVAL_CURL: 1284ac421f39SYohann break; // TODO: Not implemented 1285ac421f39SYohann } 1286ac421f39SYohann } 1287ac421f39SYohann code << " }\n"; 1288ac421f39SYohann } 1289241a4b83SYohann 1290241a4b83SYohann // Output basis apply if needed 1291ac421f39SYohann // Generate the correct eval mode code for each output 1292818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1293241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1294818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1295241a4b83SYohann // Get elemsize, emode, ncomp 1296241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1297241a4b83SYohann CeedChk(ierr); 1298241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1299241a4b83SYohann CeedChk(ierr); 1300241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1301241a4b83SYohann CeedChk(ierr); 13024d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1303241a4b83SYohann CeedChk(ierr); 1304241a4b83SYohann // Basis action 1305920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1306241a4b83SYohann switch (emode) { 1307241a4b83SYohann case CEED_EVAL_NONE: 1308920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1309241a4b83SYohann break; // No action 1310241a4b83SYohann case CEED_EVAL_INTERP: 1311241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1312241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1313241a4b83SYohann break; 1314241a4b83SYohann case CEED_EVAL_GRAD: 1315241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 131664d3f0c0Sjeremylt if (useCollograd) { 1317ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1318ac421f39SYohann } else { 1319241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1320ac421f39SYohann } 1321241a4b83SYohann break; 1322e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1323241a4b83SYohann case CEED_EVAL_WEIGHT: { 1324241a4b83SYohann Ceed ceed; 1325241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1326241a4b83SYohann return CeedError(ceed, 1, 1327241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1328241a4b83SYohann break; // Should not occur 1329241a4b83SYohann } 1330241a4b83SYohann case CEED_EVAL_DIV: 1331241a4b83SYohann break; // TODO: Not implemented 1332241a4b83SYohann case CEED_EVAL_CURL: 1333241a4b83SYohann break; // TODO: Not implemented 1334e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1335241a4b83SYohann } 1336920dcdc4Sjeremylt // Restriction 13379a2291e3SJeremy L Thompson bool isStrided; 1338fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 13399a2291e3SJeremy L Thompson if (!isStrided) { 13405c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 13415c7b696cSjeremylt CeedChk(ierr); 13425c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13435c7b696cSjeremylt CeedInt compstride; 13445c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 13455c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 13466c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 13479a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13485c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1349920dcdc4Sjeremylt } else { 1350920dcdc4Sjeremylt bool backendstrides; 1351fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1352920dcdc4Sjeremylt CeedChk(ierr); 135313585805SJeremy L Thompson CeedInt nelem; 135413585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 135513585805SJeremy L Thompson CeedChk(ierr); 135613585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1357920dcdc4Sjeremylt if (!backendstrides) { 1358920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1359920dcdc4Sjeremylt CeedChk(ierr); 1360920dcdc4Sjeremylt } 1361920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1362d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1363920dcdc4Sjeremylt } 1364241a4b83SYohann } 1365241a4b83SYohann 1366241a4b83SYohann code << " }\n"; 1367818e0025SJeremy L Thompson code << "}\n"; 1368818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1369241a4b83SYohann 1370ab213215SJeremy L Thompson // View kernel for debugging 1371b8e71988SJeremy L Thompson CeedDebug(code.str().c_str()); 1372241a4b83SYohann 1373*18d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 1374*18d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1375920dcdc4Sjeremylt CeedChk(ierr); 1376c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1377241a4b83SYohann CeedChk(ierr); 1378241a4b83SYohann 1379241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1380241a4b83SYohann return 0; 1381241a4b83SYohann } 1382ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1383