1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 177df94212SJeremy L Thompson 18241a4b83SYohann #include "ceed-cuda-gen.h" 19241a4b83SYohann #include <iostream> 20241a4b83SYohann #include <sstream> 21241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 22241a4b83SYohann 23f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 24ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 25ab213215SJeremy L Thompson // Atomic add, for older CUDA 26ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 27241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 28241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 29241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 30241a4b83SYohann do { 31241a4b83SYohann assumed = old; 32241a4b83SYohann old = 33241a4b83SYohann atomicCAS(address_as_ull, assumed, 34241a4b83SYohann __double_as_longlong(val + 35241a4b83SYohann __longlong_as_double(assumed))); 36241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 37241a4b83SYohann // (since NaN != NaN) 38241a4b83SYohann } while (assumed != old); 39241a4b83SYohann return __longlong_as_double(old); 40241a4b83SYohann } 41f1a13f77SYohann Dudouit ); 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 44f1a13f77SYohann Dudouit 45ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 46ab213215SJeremy L Thompson // Typedefs 47ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 48f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 49f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 50f1a13f77SYohann Dudouit 51f1a13f77SYohann Dudouit typedef struct { 52f1a13f77SYohann Dudouit CeedInt tidx; 53f1a13f77SYohann Dudouit CeedInt tidy; 54f1a13f77SYohann Dudouit CeedInt tidz; 55f1a13f77SYohann Dudouit CeedInt tid; 56f1a13f77SYohann Dudouit CeedScalar* slice; 57f1a13f77SYohann Dudouit } BackendData; 58241a4b83SYohann 59ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 60ab213215SJeremy L Thompson // Load matrices for basis actions 61ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 62241a4b83SYohann template <int P, int Q> 63241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 64ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 65241a4b83SYohann B[i] = d_B[i]; 66241a4b83SYohann } 67241a4b83SYohann 68ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 69241a4b83SYohann // 1D 70ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 71ab213215SJeremy L Thompson 72ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 73ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 74ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 755c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 765c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77ab213215SJeremy L Thompson if (data.tidx < P1d) { 784d537eeaSYohann const CeedInt node = data.tidx; 79ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 80ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 815c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 82241a4b83SYohann } 83241a4b83SYohann } 84920dcdc4Sjeremylt 85ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 86ab213215SJeremy L Thompson // L-vector -> E-vector, strided 87ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 88d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 89d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 90ab213215SJeremy L Thompson if (data.tidx < P1d) { 91ccedf6b0Sjeremylt const CeedInt node = data.tidx; 92d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 93ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 94d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 95ccedf6b0Sjeremylt } 96ccedf6b0Sjeremylt } 97241a4b83SYohann 98ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 99ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 100ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1025c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 103ab213215SJeremy L Thompson if (data.tidx < P1d) { 1044d537eeaSYohann const CeedInt node = data.tidx; 105ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 106ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1075c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 108241a4b83SYohann } 109241a4b83SYohann } 110241a4b83SYohann 111ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 112ab213215SJeremy L Thompson // E-vector -> L-vector, strided 113ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 114d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 115d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 116ab213215SJeremy L Thompson if (data.tidx < P1d) { 117ccedf6b0Sjeremylt const CeedInt node = data.tidx; 118d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 119ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 120d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 121ccedf6b0Sjeremylt } 122ccedf6b0Sjeremylt } 123ccedf6b0Sjeremylt 124ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 125ab213215SJeremy L Thompson // 1D tensor contraction x 126ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 127241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 128ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 129241a4b83SYohann data.slice[data.tidx] = *U; 130241a4b83SYohann __syncthreads(); 131241a4b83SYohann *V = 0.0; 13218d499f1SYohann if (data.tidx < Q1d) 133ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 134ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 135241a4b83SYohann __syncthreads(); 136241a4b83SYohann } 137241a4b83SYohann 138ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 139ab213215SJeremy L Thompson // 1D transpose tensor contraction x 140ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 141241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 142ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 143241a4b83SYohann data.slice[data.tidx] = *U; 144241a4b83SYohann __syncthreads(); 145241a4b83SYohann *V = 0.0; 14618d499f1SYohann if (data.tidx < P1d) 147ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 148ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 149241a4b83SYohann __syncthreads(); 150241a4b83SYohann } 151241a4b83SYohann 152ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 153ab213215SJeremy L Thompson // 1D interpolate to quadrature points 154ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 155241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 156ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 157ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 158241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 159241a4b83SYohann } 160241a4b83SYohann 161ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 162ab213215SJeremy L Thompson // 1D interpolate transpose 163ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 164241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 165ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 166ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 167241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 168241a4b83SYohann } 169241a4b83SYohann 170ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 171ab213215SJeremy L Thompson // 1D derivatives at quadrature points 172ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 173241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 174ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 175ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 176241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 177241a4b83SYohann } 178241a4b83SYohann 179ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 180ab213215SJeremy L Thompson // 1D derivatives transpose 181ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 182241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 183ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 184ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 185241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 186241a4b83SYohann } 187241a4b83SYohann 188ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 189241a4b83SYohann // 2D 190ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 191ab213215SJeremy L Thompson 192ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 193ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 194ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1955c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1965c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 197ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 1984d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 199ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 200ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2015c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 202241a4b83SYohann } 203241a4b83SYohann } 204920dcdc4Sjeremylt 205ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 206ab213215SJeremy L Thompson // L-vector -> E-vector, strided 207ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 208d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 209d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 210ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 211ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 212d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 213ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 214d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 215ccedf6b0Sjeremylt } 216ccedf6b0Sjeremylt } 217241a4b83SYohann 218ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 219ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 220ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2215c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 2225c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 223ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2244d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 225ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 226ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2275c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 228241a4b83SYohann } 229241a4b83SYohann } 230241a4b83SYohann 231ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 232ab213215SJeremy L Thompson // E-vector -> L-vector, strided 233ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 234d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 235d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 236ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 237ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 238d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 239ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 240d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 241ccedf6b0Sjeremylt } 242ccedf6b0Sjeremylt } 243ccedf6b0Sjeremylt 244ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 245ab213215SJeremy L Thompson // 2D tensor contraction x 246ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 247241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 248ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 24918d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 250241a4b83SYohann __syncthreads(); 251241a4b83SYohann *V = 0.0; 25218d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 253ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 25418d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 255241a4b83SYohann __syncthreads(); 256241a4b83SYohann } 257241a4b83SYohann 258ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 259ab213215SJeremy L Thompson // 2D tensor contract y 260ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 261241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 262ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 26318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 264241a4b83SYohann __syncthreads(); 265241a4b83SYohann *V = 0.0; 26618d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 267ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 26818d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 269241a4b83SYohann __syncthreads(); 270241a4b83SYohann } 271241a4b83SYohann 272ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 273ab213215SJeremy L Thompson // 2D transpose tensor contract y 274ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 275241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 276ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 27718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 278241a4b83SYohann __syncthreads(); 279241a4b83SYohann *V = 0.0; 28018d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 281ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 28218d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 283241a4b83SYohann __syncthreads(); 284241a4b83SYohann } 285241a4b83SYohann 286ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 287ab213215SJeremy L Thompson // 2D transpose tensor contract x 288ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 289241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 290ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 29118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 292241a4b83SYohann __syncthreads(); 293241a4b83SYohann *V = 0.0; 29418d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 295ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 29618d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 297241a4b83SYohann __syncthreads(); 298241a4b83SYohann } 299241a4b83SYohann 300ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 301ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 302ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 303241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 304ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 30518d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 306241a4b83SYohann __syncthreads(); 30718d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 308ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 30918d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 310241a4b83SYohann __syncthreads(); 311241a4b83SYohann } 312241a4b83SYohann 313ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 314ab213215SJeremy L Thompson // 2D interpolate to quadrature points 315ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 316241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 317ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 318241a4b83SYohann CeedScalar r_t[1]; 319ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 320241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 321241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 322241a4b83SYohann } 323241a4b83SYohann } 324241a4b83SYohann 325ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 326ab213215SJeremy L Thompson // 2D interpolate transpose 327ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 328241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 329ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 330241a4b83SYohann CeedScalar r_t[1]; 331ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 332241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 333241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 334241a4b83SYohann } 335241a4b83SYohann } 336241a4b83SYohann 337ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 338ab213215SJeremy L Thompson // 2D derivatives at quadrature points 339ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 340241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 341ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 342241a4b83SYohann CeedScalar r_t[1]; 343ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 344241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 345241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 346241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 347241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 348241a4b83SYohann } 349241a4b83SYohann } 350241a4b83SYohann 351ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 352ab213215SJeremy L Thompson // 2D derivatives transpose 353ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 354241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 355ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 356241a4b83SYohann CeedScalar r_t[1]; 357ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 358241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 359241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 360241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 361241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 362241a4b83SYohann } 363241a4b83SYohann } 364241a4b83SYohann 365ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 366241a4b83SYohann // 3D 367ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 368ab213215SJeremy L Thompson 369ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 370ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 371ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3725c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 3735c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 374ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 375241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3764d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 377ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 378ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3795c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 380241a4b83SYohann } 381241a4b83SYohann } 382920dcdc4Sjeremylt 383ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 384ab213215SJeremy L Thompson // L-vector -> E-vector, strided 385ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 386d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 387d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 388ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 389ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 390ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 391d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 392ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 393d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 394ccedf6b0Sjeremylt } 395ccedf6b0Sjeremylt } 396241a4b83SYohann 397ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 398ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 399ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4005c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 4015c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 40218d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 403ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 404920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 405ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4065c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 407ac421f39SYohann } 40818d499f1SYohann } 409ac421f39SYohann 410ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 411ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 412ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 413d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 41425dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 41518d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 416920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 417d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 418ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 419d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 420920dcdc4Sjeremylt } 42118d499f1SYohann } 422920dcdc4Sjeremylt 423ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 424ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 425ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4265c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 4275c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 428ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 429241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4304d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 431ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 432ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4335c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 434241a4b83SYohann } 435241a4b83SYohann } 436241a4b83SYohann 437ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 438ab213215SJeremy L Thompson // E-vector -> L-vector, strided 439ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 440d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4412f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 442ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 443ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 444ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 445d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 446ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 447d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 448ccedf6b0Sjeremylt } 449ccedf6b0Sjeremylt } 450ccedf6b0Sjeremylt 451ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 452ab213215SJeremy L Thompson // 3D tensor contract x 453ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 454241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 455ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 456ac421f39SYohann CeedScalar r_B[P1d]; 457ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 458ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 459ab213215SJeremy L Thompson 460ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 46118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 462241a4b83SYohann __syncthreads(); 463241a4b83SYohann V[k] = 0.0; 46418d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 465ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 46618d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 467241a4b83SYohann __syncthreads(); 468241a4b83SYohann } 469241a4b83SYohann } 470241a4b83SYohann 471ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 472ab213215SJeremy L Thompson // 3D tensor contract y 473ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 474241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 475ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 476ac421f39SYohann CeedScalar r_B[P1d]; 477ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 478ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 479ab213215SJeremy L Thompson 480ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 48118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 482241a4b83SYohann __syncthreads(); 483241a4b83SYohann V[k] = 0.0; 48418d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 485ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 48618d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 487241a4b83SYohann __syncthreads(); 488241a4b83SYohann } 489241a4b83SYohann } 490241a4b83SYohann 491ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 492ab213215SJeremy L Thompson // 3D tensor contract z 493ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 494241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 495ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 496ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 497241a4b83SYohann V[k] = 0.0; 49818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 499ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 500ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 501241a4b83SYohann } 502241a4b83SYohann } 503241a4b83SYohann 504ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 505ab213215SJeremy L Thompson // 3D transpose tensor contract z 506ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 507241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 508ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 50918d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 510241a4b83SYohann V[k] = 0.0; 51118d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 512ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 513ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 514241a4b83SYohann } 515241a4b83SYohann } 516241a4b83SYohann 517ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 518ab213215SJeremy L Thompson // 3D transpose tensor contract y 519ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 520241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 521ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 522ac421f39SYohann CeedScalar r_B[Q1d]; 523ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 524ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 525ab213215SJeremy L Thompson 526ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 52718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 528241a4b83SYohann __syncthreads(); 529241a4b83SYohann V[k] = 0.0; 53018d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 531ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 53218d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 533ac421f39SYohann __syncthreads(); 534ac421f39SYohann } 535ac421f39SYohann } 536ac421f39SYohann 537ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 538ab213215SJeremy L Thompson // 3D transpose tensor contract add y 539ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 540ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 541ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 542ac421f39SYohann CeedScalar r_B[Q1d]; 543ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 544ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 545ab213215SJeremy L Thompson 546ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 54718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 548ac421f39SYohann __syncthreads(); 54918d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 550ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 55118d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 552241a4b83SYohann __syncthreads(); 553241a4b83SYohann } 554241a4b83SYohann } 555241a4b83SYohann 556ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 557ab213215SJeremy L Thompson // 3D transpose tensor contract x 558ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 559241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 560ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 561ac421f39SYohann CeedScalar r_B[Q1d]; 562ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 563ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 564ab213215SJeremy L Thompson 565ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 56618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 567241a4b83SYohann __syncthreads(); 568241a4b83SYohann V[k] = 0.0; 56918d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 570ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 57118d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 572241a4b83SYohann __syncthreads(); 573241a4b83SYohann } 574241a4b83SYohann } 575241a4b83SYohann 576ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 577ab213215SJeremy L Thompson // 3D transpose tensor contract add x 578ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 579241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 580ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 581ac421f39SYohann CeedScalar r_B[Q1d]; 582ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 583ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 584ab213215SJeremy L Thompson 585ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 58618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 587241a4b83SYohann __syncthreads(); 58818d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 589ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 59018d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 591241a4b83SYohann __syncthreads(); 592241a4b83SYohann } 593241a4b83SYohann } 594241a4b83SYohann 595ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 596ab213215SJeremy L Thompson // 3D interpolate to quadrature points 597ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 598241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 599ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 60018d499f1SYohann CeedScalar r_t1[T1d]; 60118d499f1SYohann CeedScalar r_t2[T1d]; 602ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 603241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 604241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 605241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 606241a4b83SYohann } 607241a4b83SYohann } 608241a4b83SYohann 609ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 610ab213215SJeremy L Thompson // 3D interpolate transpose 611ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 612241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 613ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 61418d499f1SYohann CeedScalar r_t1[T1d]; 61518d499f1SYohann CeedScalar r_t2[T1d]; 616ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 617241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 618241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 619241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 620241a4b83SYohann } 621241a4b83SYohann } 622241a4b83SYohann 623ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 624ab213215SJeremy L Thompson // 3D derivatives at quadrature points 625ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 626241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 627ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 62818d499f1SYohann CeedScalar r_t1[T1d]; 62918d499f1SYohann CeedScalar r_t2[T1d]; 630ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 631241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 632241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 633241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 634241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 635241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 636241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 637241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 638241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 639241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 640241a4b83SYohann } 641241a4b83SYohann } 642241a4b83SYohann 643ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 644ab213215SJeremy L Thompson // 3D derivatives transpose 645ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 646241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 647ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 64818d499f1SYohann CeedScalar r_t1[T1d]; 64918d499f1SYohann CeedScalar r_t2[T1d]; 650ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 651241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 652241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 653241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 654241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 655241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 656241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 657241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 658241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 659241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 660241a4b83SYohann } 661241a4b83SYohann } 662241a4b83SYohann 663ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 664ab213215SJeremy L Thompson // 3D collocated derivatives computation 665ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 666ac421f39SYohann template <int NCOMP, int Q1d> 667ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 66818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 669ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 67018d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 671ac421f39SYohann __syncthreads(); 672ac421f39SYohann // X derivative 673ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 674ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 67518d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 676ac421f39SYohann // Y derivative 677ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 678ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 67918d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 680ac421f39SYohann // Z derivative 681ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 682ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 683ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 684ac421f39SYohann __syncthreads(); 685ac421f39SYohann } 686ac421f39SYohann } 68718d499f1SYohann } 688ac421f39SYohann 689ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 690ab213215SJeremy L Thompson // 3D collocated derivatives transpose 691ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 692ac421f39SYohann template <int NCOMP, int Q1d> 693ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 69418d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 695ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 696ac421f39SYohann // X derivative 69718d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 698ac421f39SYohann __syncthreads(); 699ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70018d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 701ac421f39SYohann __syncthreads(); 702ac421f39SYohann // Y derivative 70318d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 704ac421f39SYohann __syncthreads(); 705ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70618d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 707ac421f39SYohann __syncthreads(); 708ac421f39SYohann // Z derivative 709ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 710ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 711ac421f39SYohann } 712ac421f39SYohann } 71318d499f1SYohann } 714ac421f39SYohann 715ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 716ab213215SJeremy L Thompson // 1D quadrature weights 717ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 718241a4b83SYohann template <int Q1d> 719241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 72018d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 721241a4b83SYohann } 722241a4b83SYohann 723ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 724ab213215SJeremy L Thompson // 2D quadrature weights 725ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 726241a4b83SYohann template <int Q1d> 727241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 72818d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 72918d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 730241a4b83SYohann } 731241a4b83SYohann 732ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 733ab213215SJeremy L Thompson // 3D quadrature weights 734ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 735241a4b83SYohann template <int Q1d> 736241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 73718d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 73818d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 739ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 74018d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 741241a4b83SYohann } 742241a4b83SYohann 743241a4b83SYohann ); 744ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 745ab213215SJeremy L Thompson // Build singe operator kernel 746ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 747241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 748241a4b83SYohann 749241a4b83SYohann using std::ostringstream; 750241a4b83SYohann using std::string; 751241a4b83SYohann int ierr; 752241a4b83SYohann bool setupdone; 753fd364f38SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 754241a4b83SYohann if (setupdone) return 0; 755241a4b83SYohann Ceed ceed; 756241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 757241a4b83SYohann CeedOperator_Cuda_gen *data; 7586c845298Sjeremylt ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 759241a4b83SYohann CeedQFunction qf; 760241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 761241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 7626c845298Sjeremylt ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 7631da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 7645c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 765241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 766241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 767241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 768241a4b83SYohann CeedChk(ierr); 769241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 770241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 771241a4b83SYohann CeedChk(ierr); 772241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 773241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 774241a4b83SYohann CeedChk(ierr); 775241a4b83SYohann CeedEvalMode emode; 776241a4b83SYohann CeedBasis basis; 777241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 778241a4b83SYohann CeedElemRestriction Erestrict; 779*461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 780241a4b83SYohann 781241a4b83SYohann ostringstream code; 782241a4b83SYohann string devFunctions(deviceFunctions); 783241a4b83SYohann 784f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 785f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 786f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 7876c845298Sjeremylt ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 788f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 789f1a13f77SYohann Dudouit if (prop.major<6){ 790f1a13f77SYohann Dudouit code << atomicAdd; 791f1a13f77SYohann Dudouit } 792f1a13f77SYohann Dudouit 793241a4b83SYohann code << devFunctions; 794241a4b83SYohann 795241a4b83SYohann string qFunction(qf_data->qFunctionSource); 796c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 797c3c443faSYohann Dudouit string oper; 79814cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 7994d537eeaSYohann 8004d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 801ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 8021da99368SJeremy L Thompson 8031da99368SJeremy L Thompson // Find dim and Q1d 80464d3f0c0Sjeremylt bool useCollograd = true; 80518d499f1SYohann data->maxP1d = 0; 8061da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 8071da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 8081da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8096c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 8100f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 8110f54b25eSjeremylt CeedChk(ierr); 8121da99368SJeremy L Thompson 8131da99368SJeremy L Thompson // Check for collocated gradient 81418d499f1SYohann useCollograd = useCollograd && basis_data->d_collograd1d; 8151da99368SJeremy L Thompson 8161da99368SJeremy L Thompson // Collect dim and Q1d 8171da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8181da99368SJeremy L Thompson bool isTensor; 819fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8201da99368SJeremy L Thompson if (isTensor) { 8211da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 82218d499f1SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 82318d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8241da99368SJeremy L Thompson } else { 825e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8261da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 827e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8281da99368SJeremy L Thompson } 8291da99368SJeremy L Thompson } 8301da99368SJeremy L Thompson } 8311da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8321da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8331da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 8341da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 8350f54b25eSjeremylt 8361da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8376c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 8380f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 8390f54b25eSjeremylt CeedChk(ierr); 8400f54b25eSjeremylt 8411da99368SJeremy L Thompson // Collect dim and Q1d 8421da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8431da99368SJeremy L Thompson bool isTensor; 844fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8451da99368SJeremy L Thompson if (isTensor) { 8461da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8471da99368SJeremy L Thompson } else { 848e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8491da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 850e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8511da99368SJeremy L Thompson } 8520f54b25eSjeremylt 8530f54b25eSjeremylt // Check for collocated gradient 85464d3f0c0Sjeremylt useCollograd = useCollograd && basis_data->d_collograd1d; 8551da99368SJeremy L Thompson } 8561da99368SJeremy L Thompson } 8571da99368SJeremy L Thompson data->dim = dim; 8581da99368SJeremy L Thompson data->Q1d = Q1d; 8591da99368SJeremy L Thompson 8601da99368SJeremy L Thompson // Define CEED_Q_VLA 86164d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8621da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8631da99368SJeremy L Thompson } else { 8641da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8651da99368SJeremy L Thompson } 8661da99368SJeremy L Thompson 867241a4b83SYohann code << qFunction; 868241a4b83SYohann 869241a4b83SYohann // Setup 870818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 871c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 872241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 873241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 874241a4b83SYohann CeedChk(ierr); 8751da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 876241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 877241a4b83SYohann } 878241a4b83SYohann } 879241a4b83SYohann 880241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 881241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 882241a4b83SYohann } 8831da99368SJeremy L Thompson 884241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 885241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 8861da99368SJeremy L Thompson 887241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 888241a4b83SYohann code << " BackendData data;\n"; 889241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 890241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 891241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 892241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 89318d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 894920dcdc4Sjeremylt 895818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 896ac421f39SYohann //Initialize constants, and matrices B and G 897241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 898818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 899241a4b83SYohann // Get elemsize, emode, ncomp 900241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 901241a4b83SYohann CeedChk(ierr); 902241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 903241a4b83SYohann CeedChk(ierr); 904241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 905241a4b83SYohann CeedChk(ierr); 9064d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 907241a4b83SYohann CeedChk(ierr); 908920dcdc4Sjeremylt 909920dcdc4Sjeremylt // Set field constants 910920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 911920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 912920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 913920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 914920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 915920dcdc4Sjeremylt } else { 916920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 917920dcdc4Sjeremylt } 918920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 919920dcdc4Sjeremylt } 920920dcdc4Sjeremylt 921920dcdc4Sjeremylt // Load basis data 922920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 923241a4b83SYohann switch (emode) { 924241a4b83SYohann case CEED_EVAL_NONE: 925241a4b83SYohann break; 926241a4b83SYohann case CEED_EVAL_INTERP: 9276c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 928241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 929241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 930241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 931241a4b83SYohann break; 932241a4b83SYohann case CEED_EVAL_GRAD: 9336c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 934241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 935241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 936241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 93764d3f0c0Sjeremylt if (useCollograd) { 938ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 939ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 940ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 941ac421f39SYohann } else { 942ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 943ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 944241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 945ac421f39SYohann } 946241a4b83SYohann break; 947241a4b83SYohann case CEED_EVAL_WEIGHT: 948241a4b83SYohann break; // No action 949241a4b83SYohann case CEED_EVAL_DIV: 950241a4b83SYohann break; // TODO: Not implemented 951241a4b83SYohann case CEED_EVAL_CURL: 952241a4b83SYohann break; // TODO: Not implemented 953241a4b83SYohann } 954241a4b83SYohann } 955920dcdc4Sjeremylt 956818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 957241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 958818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 959241a4b83SYohann // Get elemsize, emode, ncomp 960241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 961241a4b83SYohann CeedChk(ierr); 962241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 963241a4b83SYohann CeedChk(ierr); 964241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 965241a4b83SYohann CeedChk(ierr); 9664d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 967241a4b83SYohann CeedChk(ierr); 968920dcdc4Sjeremylt 969920dcdc4Sjeremylt // Set field constants 970241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 971920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 972241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 973241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 974920dcdc4Sjeremylt } else { 975920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 976920dcdc4Sjeremylt } 977920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 978920dcdc4Sjeremylt 979920dcdc4Sjeremylt // Load basis data 980920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 981920dcdc4Sjeremylt switch (emode) { 982920dcdc4Sjeremylt case CEED_EVAL_NONE: 983920dcdc4Sjeremylt break; // No action 984920dcdc4Sjeremylt case CEED_EVAL_INTERP: 9856c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 986241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 987241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 988241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 989241a4b83SYohann break; 990241a4b83SYohann case CEED_EVAL_GRAD: 9916c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 992241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 993241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 994241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 99564d3f0c0Sjeremylt if (useCollograd) { 996ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 997ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 998ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 999ac421f39SYohann } else { 1000ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 1001ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1002241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1003ac421f39SYohann } 1004241a4b83SYohann break; 1005e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1006241a4b83SYohann case CEED_EVAL_WEIGHT: { 1007241a4b83SYohann Ceed ceed; 1008241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1009241a4b83SYohann return CeedError(ceed, 1, 1010241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1011241a4b83SYohann break; // Should not occur 1012241a4b83SYohann } 1013241a4b83SYohann case CEED_EVAL_DIV: 1014241a4b83SYohann break; // TODO: Not implemented 1015241a4b83SYohann case CEED_EVAL_CURL: 1016241a4b83SYohann break; // TODO: Not implemented 1017e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1018241a4b83SYohann } 1019241a4b83SYohann } 1020818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1021ac421f39SYohann code << " __syncthreads();\n"; 1022241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1023241a4b83SYohann // Input basis apply if needed 1024ac421f39SYohann // Generate the correct eval mode code for each input 1025818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1026241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1027818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1028241a4b83SYohann // Get elemsize, emode, ncomp 1029241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1030241a4b83SYohann CeedChk(ierr); 1031241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1032241a4b83SYohann CeedChk(ierr); 1033241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1034241a4b83SYohann CeedChk(ierr); 10354d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1036241a4b83SYohann CeedChk(ierr); 1037920dcdc4Sjeremylt 1038920dcdc4Sjeremylt // Restriction 1039920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 104064d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1041241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10429a2291e3SJeremy L Thompson 10439a2291e3SJeremy L Thompson bool isStrided; 1044fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 10459a2291e3SJeremy L Thompson if (!isStrided) { 10465c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 10475c7b696cSjeremylt CeedChk(ierr); 10485c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10495c7b696cSjeremylt CeedInt compstride; 10505c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 10515c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 10526c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 10539a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10545c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1055ccedf6b0Sjeremylt } else { 1056920dcdc4Sjeremylt bool backendstrides; 1057fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1058920dcdc4Sjeremylt CeedChk(ierr); 105913585805SJeremy L Thompson CeedInt nelem; 106013585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 106113585805SJeremy L Thompson CeedChk(ierr); 106213585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1063920dcdc4Sjeremylt if (!backendstrides) { 1064920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1065920dcdc4Sjeremylt CeedChk(ierr); 1066ccedf6b0Sjeremylt } 1067920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1068d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1069920dcdc4Sjeremylt } 1070920dcdc4Sjeremylt } 1071920dcdc4Sjeremylt 1072920dcdc4Sjeremylt // Basis action 1073920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1074920dcdc4Sjeremylt switch (emode) { 1075920dcdc4Sjeremylt case CEED_EVAL_NONE: 107664d3f0c0Sjeremylt if (!useCollograd) { 1077920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1078920dcdc4Sjeremylt } 1079920dcdc4Sjeremylt break; 1080920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1081241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1082241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1083241a4b83SYohann break; 1084241a4b83SYohann case CEED_EVAL_GRAD: 108564d3f0c0Sjeremylt if (useCollograd) { 1086ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1087ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1088ac421f39SYohann } else { 1089241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1090241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1091ac421f39SYohann } 1092241a4b83SYohann break; 1093241a4b83SYohann case CEED_EVAL_WEIGHT: 1094241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1095241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 10966c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1097241a4b83SYohann data->W = basis_data->d_qweight1d; 1098241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1099241a4b83SYohann break; // No action 1100241a4b83SYohann case CEED_EVAL_DIV: 1101241a4b83SYohann break; // TODO: Not implemented 1102241a4b83SYohann case CEED_EVAL_CURL: 1103241a4b83SYohann break; // TODO: Not implemented 1104241a4b83SYohann } 1105241a4b83SYohann } 1106ac421f39SYohann 1107241a4b83SYohann // Q function 1108818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1109241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1110818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1111241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1112241a4b83SYohann CeedChk(ierr); 1113241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1114241a4b83SYohann { 111564d3f0c0Sjeremylt if (useCollograd) { 1116ac421f39SYohann //Accumulator for gradient slices 1117ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1118ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1119ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1120ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1121ac421f39SYohann code << " }\n"; 1122ac421f39SYohann code << " }\n"; 1123ac421f39SYohann } else { 1124241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1125241a4b83SYohann } 1126ac421f39SYohann } 1127241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1128241a4b83SYohann { 1129241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1130241a4b83SYohann } 1131241a4b83SYohann } 1132ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 113364d3f0c0Sjeremylt if (useCollograd) { 1134920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1135ac421f39SYohann code << "#pragma unroll\n"; 1136ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1137818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1138ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1139818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1140ac421f39SYohann // Get elemsize, emode, ncomp 1141ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1142ac421f39SYohann CeedChk(ierr); 1143ac421f39SYohann // Basis action 1144920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1145ac421f39SYohann switch (emode) { 1146ac421f39SYohann case CEED_EVAL_NONE: 1147ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11489a2291e3SJeremy L Thompson 11499a2291e3SJeremy L Thompson bool isStrided; 1150792ff326SYohann Dudouit ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 115113585805SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1152fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 11539a2291e3SJeremy L Thompson if (!isStrided) { 11545c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 11555c7b696cSjeremylt CeedChk(ierr); 11565c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11575c7b696cSjeremylt CeedInt compstride; 11585c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 11595c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 11606c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 11619a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11625c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1163920dcdc4Sjeremylt } else { 1164920dcdc4Sjeremylt bool backendstrides; 1165fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1166920dcdc4Sjeremylt CeedChk(ierr); 116713585805SJeremy L Thompson CeedInt nelem; 116813585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 116913585805SJeremy L Thompson CeedChk(ierr); 117013585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1171920dcdc4Sjeremylt if (!backendstrides) { 1172920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1173920dcdc4Sjeremylt CeedChk(ierr); 1174920dcdc4Sjeremylt } 1175920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1176d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1177920dcdc4Sjeremylt } 1178ac421f39SYohann break; 1179ac421f39SYohann case CEED_EVAL_INTERP: 1180ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1181ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1182ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1183ac421f39SYohann code << " }\n"; 1184ac421f39SYohann break; 1185ac421f39SYohann case CEED_EVAL_GRAD: 1186ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1187ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1188ac421f39SYohann break; 1189ac421f39SYohann case CEED_EVAL_WEIGHT: 1190ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1191ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1192ac421f39SYohann break; // No action 1193ac421f39SYohann case CEED_EVAL_DIV: 1194ac421f39SYohann break; // TODO: Not implemented 1195ac421f39SYohann case CEED_EVAL_CURL: 1196ac421f39SYohann break; // TODO: Not implemented 1197ac421f39SYohann } 1198ac421f39SYohann } 1199818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1200ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1201818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1202ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1203ac421f39SYohann CeedChk(ierr); 1204ac421f39SYohann // Basis action 1205ac421f39SYohann switch (emode) { 1206ac421f39SYohann case CEED_EVAL_NONE: 1207ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1208ac421f39SYohann break; // No action 1209ac421f39SYohann case CEED_EVAL_INTERP: 1210ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1211ac421f39SYohann break; 1212ac421f39SYohann case CEED_EVAL_GRAD: 1213ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1214ac421f39SYohann break; 1215ac421f39SYohann case CEED_EVAL_WEIGHT: 1216ac421f39SYohann break; // Should not occur 1217ac421f39SYohann case CEED_EVAL_DIV: 1218ac421f39SYohann break; // TODO: Not implemented 1219ac421f39SYohann case CEED_EVAL_CURL: 1220ac421f39SYohann break; // TODO: Not implemented 1221ac421f39SYohann } 1222ac421f39SYohann } 1223ac421f39SYohann } else { 1224920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1225818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1226ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1227818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1228ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1229ac421f39SYohann } 1230818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1231ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1232818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1233ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1234ac421f39SYohann } 1235ac421f39SYohann } 1236818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12374d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12384d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1239818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1240ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12414d537eeaSYohann } 12424d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12434d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1244818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1245ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12464d537eeaSYohann } 1247818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1248ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 124964d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1250ac421f39SYohann code << "1"; 1251ac421f39SYohann } else { 1252ac421f39SYohann code << "Q1d"; 1253ac421f39SYohann } 1254ac421f39SYohann code << ", in, out);\n"; 125564d3f0c0Sjeremylt if (useCollograd) { 1256920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1257818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1258ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1259818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1260ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1261ac421f39SYohann CeedChk(ierr); 1262ac421f39SYohann // Basis action 1263920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1264ac421f39SYohann switch (emode) { 1265ac421f39SYohann case CEED_EVAL_NONE: 1266ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1267ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1268ac421f39SYohann code << " }\n"; 1269ac421f39SYohann break; // No action 1270ac421f39SYohann case CEED_EVAL_INTERP: 1271ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1272ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1273ac421f39SYohann code << " }\n"; 1274ac421f39SYohann break; 1275ac421f39SYohann case CEED_EVAL_GRAD: 1276ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1277ac421f39SYohann break; 1278ac421f39SYohann case CEED_EVAL_WEIGHT: 1279ac421f39SYohann break; // Should not occur 1280ac421f39SYohann case CEED_EVAL_DIV: 1281ac421f39SYohann break; // TODO: Not implemented 1282ac421f39SYohann case CEED_EVAL_CURL: 1283ac421f39SYohann break; // TODO: Not implemented 1284ac421f39SYohann } 1285ac421f39SYohann } 1286ac421f39SYohann code << " }\n"; 1287ac421f39SYohann } 1288241a4b83SYohann 1289241a4b83SYohann // Output basis apply if needed 1290ac421f39SYohann // Generate the correct eval mode code for each output 1291818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1292241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1293818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1294241a4b83SYohann // Get elemsize, emode, ncomp 1295241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1296241a4b83SYohann CeedChk(ierr); 1297241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1298241a4b83SYohann CeedChk(ierr); 1299241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1300241a4b83SYohann CeedChk(ierr); 13014d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1302241a4b83SYohann CeedChk(ierr); 1303241a4b83SYohann // Basis action 1304920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1305241a4b83SYohann switch (emode) { 1306241a4b83SYohann case CEED_EVAL_NONE: 1307920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1308241a4b83SYohann break; // No action 1309241a4b83SYohann case CEED_EVAL_INTERP: 1310241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1311241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1312241a4b83SYohann break; 1313241a4b83SYohann case CEED_EVAL_GRAD: 1314241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 131564d3f0c0Sjeremylt if (useCollograd) { 1316ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1317ac421f39SYohann } else { 1318241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1319ac421f39SYohann } 1320241a4b83SYohann break; 1321e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1322241a4b83SYohann case CEED_EVAL_WEIGHT: { 1323241a4b83SYohann Ceed ceed; 1324241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1325241a4b83SYohann return CeedError(ceed, 1, 1326241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1327241a4b83SYohann break; // Should not occur 1328241a4b83SYohann } 1329241a4b83SYohann case CEED_EVAL_DIV: 1330241a4b83SYohann break; // TODO: Not implemented 1331241a4b83SYohann case CEED_EVAL_CURL: 1332241a4b83SYohann break; // TODO: Not implemented 1333e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1334241a4b83SYohann } 1335920dcdc4Sjeremylt // Restriction 13369a2291e3SJeremy L Thompson bool isStrided; 1337fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 13389a2291e3SJeremy L Thompson if (!isStrided) { 13395c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 13405c7b696cSjeremylt CeedChk(ierr); 13415c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13425c7b696cSjeremylt CeedInt compstride; 13435c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 13445c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 13456c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 13469a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13475c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1348920dcdc4Sjeremylt } else { 1349920dcdc4Sjeremylt bool backendstrides; 1350fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1351920dcdc4Sjeremylt CeedChk(ierr); 135213585805SJeremy L Thompson CeedInt nelem; 135313585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 135413585805SJeremy L Thompson CeedChk(ierr); 135513585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1356920dcdc4Sjeremylt if (!backendstrides) { 1357920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1358920dcdc4Sjeremylt CeedChk(ierr); 1359920dcdc4Sjeremylt } 1360920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1361d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1362920dcdc4Sjeremylt } 1363241a4b83SYohann } 1364241a4b83SYohann 1365241a4b83SYohann code << " }\n"; 1366818e0025SJeremy L Thompson code << "}\n"; 1367818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1368241a4b83SYohann 1369ab213215SJeremy L Thompson // View kernel for debugging 1370b8e71988SJeremy L Thompson CeedDebug(code.str().c_str()); 1371241a4b83SYohann 137218d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 137318d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1374920dcdc4Sjeremylt CeedChk(ierr); 1375c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1376241a4b83SYohann CeedChk(ierr); 1377241a4b83SYohann 1378241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1379241a4b83SYohann return 0; 1380241a4b83SYohann } 1381ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1382