1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 163d576824SJeremy L Thompson 17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 187df94212SJeremy L Thompson 19ec3da8bcSJed Brown #include <ceed/ceed.h> 20ec3da8bcSJed Brown #include <ceed/backend.h> 213d576824SJeremy L Thompson #include <cuda_runtime.h> 22241a4b83SYohann #include <iostream> 23241a4b83SYohann #include <sstream> 243d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 25*6d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 26241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 27241a4b83SYohann 28f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 29ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 30ab213215SJeremy L Thompson // Atomic add, for older CUDA 31ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3280a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { 33241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 34241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 35241a4b83SYohann do { 36241a4b83SYohann assumed = old; 37241a4b83SYohann old = 38241a4b83SYohann atomicCAS(address_as_ull, assumed, 39241a4b83SYohann __double_as_longlong(val + 40241a4b83SYohann __longlong_as_double(assumed))); 41241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 42241a4b83SYohann // (since NaN != NaN) 43241a4b83SYohann } while (assumed != old); 44241a4b83SYohann return __longlong_as_double(old); 45241a4b83SYohann } 46f1a13f77SYohann Dudouit ); 47f1a13f77SYohann Dudouit 48f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 49f1a13f77SYohann Dudouit 50ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 51ab213215SJeremy L Thompson // Typedefs 52ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 53f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 54f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 55f1a13f77SYohann Dudouit 56f1a13f77SYohann Dudouit typedef struct { 57f1a13f77SYohann Dudouit CeedInt tidx; 58f1a13f77SYohann Dudouit CeedInt tidy; 59f1a13f77SYohann Dudouit CeedInt tidz; 60f1a13f77SYohann Dudouit CeedInt tid; 61f1a13f77SYohann Dudouit CeedScalar* slice; 62f1a13f77SYohann Dudouit } BackendData; 63241a4b83SYohann 64ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 65ab213215SJeremy L Thompson // Load matrices for basis actions 66ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 67241a4b83SYohann template <int P, int Q> 68053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 69ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 70241a4b83SYohann B[i] = d_B[i]; 71241a4b83SYohann } 72241a4b83SYohann 73ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 74241a4b83SYohann // 1D 75ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 76ab213215SJeremy L Thompson 77ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 78ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 79ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 805c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 81053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 82ab213215SJeremy L Thompson if (data.tidx < P1d) { 834d537eeaSYohann const CeedInt node = data.tidx; 84ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 85ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 865c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 87241a4b83SYohann } 88241a4b83SYohann } 89920dcdc4Sjeremylt 90ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 91ab213215SJeremy L Thompson // L-vector -> E-vector, strided 92ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 93d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 94053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 95ab213215SJeremy L Thompson if (data.tidx < P1d) { 96ccedf6b0Sjeremylt const CeedInt node = data.tidx; 97d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 98ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 99d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 100ccedf6b0Sjeremylt } 101ccedf6b0Sjeremylt } 102241a4b83SYohann 103ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 104ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 105ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1065c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 107053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 108ab213215SJeremy L Thompson if (data.tidx < P1d) { 1094d537eeaSYohann const CeedInt node = data.tidx; 110ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 111ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1125c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 113241a4b83SYohann } 114241a4b83SYohann } 115241a4b83SYohann 116ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 117ab213215SJeremy L Thompson // E-vector -> L-vector, strided 118ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 119d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 120d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 121ab213215SJeremy L Thompson if (data.tidx < P1d) { 122ccedf6b0Sjeremylt const CeedInt node = data.tidx; 123d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 124ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 125d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 126ccedf6b0Sjeremylt } 127ccedf6b0Sjeremylt } 128ccedf6b0Sjeremylt 129ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 130ab213215SJeremy L Thompson // 1D tensor contraction x 131ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 132241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 133ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 134241a4b83SYohann data.slice[data.tidx] = *U; 135241a4b83SYohann __syncthreads(); 136241a4b83SYohann *V = 0.0; 13718d499f1SYohann if (data.tidx < Q1d) 138ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 139ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 140241a4b83SYohann __syncthreads(); 141241a4b83SYohann } 142241a4b83SYohann 143ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 144ab213215SJeremy L Thompson // 1D transpose tensor contraction x 145ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 146241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 147ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 148241a4b83SYohann data.slice[data.tidx] = *U; 149241a4b83SYohann __syncthreads(); 150241a4b83SYohann *V = 0.0; 15118d499f1SYohann if (data.tidx < P1d) 152ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 153ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 154241a4b83SYohann __syncthreads(); 155241a4b83SYohann } 156241a4b83SYohann 157ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 158ab213215SJeremy L Thompson // 1D interpolate to quadrature points 159ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 160241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 161ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 162ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 163241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 164241a4b83SYohann } 165241a4b83SYohann 166ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 167ab213215SJeremy L Thompson // 1D interpolate transpose 168ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 169241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 170ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 171ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 172241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 173241a4b83SYohann } 174241a4b83SYohann 175ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 176ab213215SJeremy L Thompson // 1D derivatives at quadrature points 177ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 178241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 179ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 180ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 181241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 182241a4b83SYohann } 183241a4b83SYohann 184ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 185ab213215SJeremy L Thompson // 1D derivatives transpose 186ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 187241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 188ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 189ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 190241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 191241a4b83SYohann } 192241a4b83SYohann 193ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 194241a4b83SYohann // 2D 195ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 196ab213215SJeremy L Thompson 197ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 198ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 199ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2005c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 201053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 202ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2034d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 204ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 205ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2065c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 207241a4b83SYohann } 208241a4b83SYohann } 209920dcdc4Sjeremylt 210ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 211ab213215SJeremy L Thompson // L-vector -> E-vector, strided 212ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 213d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 214053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 215ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 216ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 217d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 218ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 219d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 220ccedf6b0Sjeremylt } 221ccedf6b0Sjeremylt } 222241a4b83SYohann 223ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 224ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 225ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2265c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 227053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 228ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2294d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 230ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 231ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2325c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 233241a4b83SYohann } 234241a4b83SYohann } 235241a4b83SYohann 236ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 237ab213215SJeremy L Thompson // E-vector -> L-vector, strided 238ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 239d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 240d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 241ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 242ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 243d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 244ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 245d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 246ccedf6b0Sjeremylt } 247ccedf6b0Sjeremylt } 248ccedf6b0Sjeremylt 249ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 250ab213215SJeremy L Thompson // 2D tensor contraction x 251ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 252241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 253ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 25418d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 255241a4b83SYohann __syncthreads(); 256241a4b83SYohann *V = 0.0; 25718d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 258ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 25918d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 260241a4b83SYohann __syncthreads(); 261241a4b83SYohann } 262241a4b83SYohann 263ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 264ab213215SJeremy L Thompson // 2D tensor contract y 265ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 266241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 267ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 26818d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 269241a4b83SYohann __syncthreads(); 270241a4b83SYohann *V = 0.0; 27118d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 272ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 27318d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 274241a4b83SYohann __syncthreads(); 275241a4b83SYohann } 276241a4b83SYohann 277ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 278ab213215SJeremy L Thompson // 2D transpose tensor contract y 279ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 280241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 281ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 28218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 283241a4b83SYohann __syncthreads(); 284241a4b83SYohann *V = 0.0; 28518d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 286ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 28718d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 288241a4b83SYohann __syncthreads(); 289241a4b83SYohann } 290241a4b83SYohann 291ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 292ab213215SJeremy L Thompson // 2D transpose tensor contract x 293ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 294241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 295ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 29618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 297241a4b83SYohann __syncthreads(); 298241a4b83SYohann *V = 0.0; 29918d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 300ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 30118d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 302241a4b83SYohann __syncthreads(); 303241a4b83SYohann } 304241a4b83SYohann 305ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 306ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 307ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 308241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 309ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 31018d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 311241a4b83SYohann __syncthreads(); 31218d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 313ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 31418d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 315241a4b83SYohann __syncthreads(); 316241a4b83SYohann } 317241a4b83SYohann 318ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 319ab213215SJeremy L Thompson // 2D interpolate to quadrature points 320ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 321241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 322ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 323241a4b83SYohann CeedScalar r_t[1]; 324ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 325241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 326241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 327241a4b83SYohann } 328241a4b83SYohann } 329241a4b83SYohann 330ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 331ab213215SJeremy L Thompson // 2D interpolate transpose 332ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 333241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 334ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 335241a4b83SYohann CeedScalar r_t[1]; 336ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 337241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 338241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 339241a4b83SYohann } 340241a4b83SYohann } 341241a4b83SYohann 342ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 343ab213215SJeremy L Thompson // 2D derivatives at quadrature points 344ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 345241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 346ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 347241a4b83SYohann CeedScalar r_t[1]; 348ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 349241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 350241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 351241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 352241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 353241a4b83SYohann } 354241a4b83SYohann } 355241a4b83SYohann 356ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 357ab213215SJeremy L Thompson // 2D derivatives transpose 358ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 359241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 360ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 361241a4b83SYohann CeedScalar r_t[1]; 362ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 363241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 364241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 365241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 366241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 367241a4b83SYohann } 368241a4b83SYohann } 369241a4b83SYohann 370ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 371241a4b83SYohann // 3D 372ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 373ab213215SJeremy L Thompson 374ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 375ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 376ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3775c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 378053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 379ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 380241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3814d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 382ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 383ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3845c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 385241a4b83SYohann } 386241a4b83SYohann } 387920dcdc4Sjeremylt 388ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 389ab213215SJeremy L Thompson // L-vector -> E-vector, strided 390ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 391d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 392053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 393ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 394ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 395ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 396d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 397ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 398d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 399ccedf6b0Sjeremylt } 400ccedf6b0Sjeremylt } 401241a4b83SYohann 402ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 403ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 404ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4055c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 406053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 40718d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 408ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 409920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 410ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4115c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 412ac421f39SYohann } 41318d499f1SYohann } 414ac421f39SYohann 415ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 416ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 417ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 418d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 419053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 42018d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 421920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 422d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 423ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 424d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 425920dcdc4Sjeremylt } 42618d499f1SYohann } 427920dcdc4Sjeremylt 428ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 429ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 430ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4315c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 432053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 433ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 434241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4354d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 436ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 437ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4385c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 439241a4b83SYohann } 440241a4b83SYohann } 441241a4b83SYohann 442ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 443ab213215SJeremy L Thompson // E-vector -> L-vector, strided 444ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 445d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4462f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 447ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 448ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 449ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 450d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 451ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 452d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 453ccedf6b0Sjeremylt } 454ccedf6b0Sjeremylt } 455ccedf6b0Sjeremylt 456ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 457ab213215SJeremy L Thompson // 3D tensor contract x 458ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 459241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 460ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 461ac421f39SYohann CeedScalar r_B[P1d]; 462ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 463ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 464ab213215SJeremy L Thompson 465ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 46618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 467241a4b83SYohann __syncthreads(); 468241a4b83SYohann V[k] = 0.0; 46918d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 470ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 47118d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 472241a4b83SYohann __syncthreads(); 473241a4b83SYohann } 474241a4b83SYohann } 475241a4b83SYohann 476ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 477ab213215SJeremy L Thompson // 3D tensor contract y 478ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 479241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 480ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 481ac421f39SYohann CeedScalar r_B[P1d]; 482ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 483ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 484ab213215SJeremy L Thompson 485ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 48618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 487241a4b83SYohann __syncthreads(); 488241a4b83SYohann V[k] = 0.0; 48918d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 490ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 49118d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 492241a4b83SYohann __syncthreads(); 493241a4b83SYohann } 494241a4b83SYohann } 495241a4b83SYohann 496ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 497ab213215SJeremy L Thompson // 3D tensor contract z 498ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 499241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 500ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 501ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 502241a4b83SYohann V[k] = 0.0; 50318d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 504ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 505ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 506241a4b83SYohann } 507241a4b83SYohann } 508241a4b83SYohann 509ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 510ab213215SJeremy L Thompson // 3D transpose tensor contract z 511ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 512241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 513ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 51418d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 515241a4b83SYohann V[k] = 0.0; 51618d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 517ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 518ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 519241a4b83SYohann } 520241a4b83SYohann } 521241a4b83SYohann 522ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 523ab213215SJeremy L Thompson // 3D transpose tensor contract y 524ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 525241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 526ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 527ac421f39SYohann CeedScalar r_B[Q1d]; 528ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 529ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 530ab213215SJeremy L Thompson 531ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 53218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 533241a4b83SYohann __syncthreads(); 534241a4b83SYohann V[k] = 0.0; 53518d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 536ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 53718d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 538ac421f39SYohann __syncthreads(); 539ac421f39SYohann } 540ac421f39SYohann } 541ac421f39SYohann 542ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 543ab213215SJeremy L Thompson // 3D transpose tensor contract add y 544ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 545ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 546ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 547ac421f39SYohann CeedScalar r_B[Q1d]; 548ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 549ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 550ab213215SJeremy L Thompson 551ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 55218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 553ac421f39SYohann __syncthreads(); 55418d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 555ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 55618d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 557241a4b83SYohann __syncthreads(); 558241a4b83SYohann } 559241a4b83SYohann } 560241a4b83SYohann 561ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 562ab213215SJeremy L Thompson // 3D transpose tensor contract x 563ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 564241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 565ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 566ac421f39SYohann CeedScalar r_B[Q1d]; 567ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 568ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 569ab213215SJeremy L Thompson 570ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 57118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 572241a4b83SYohann __syncthreads(); 573241a4b83SYohann V[k] = 0.0; 57418d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 575ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 57618d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 577241a4b83SYohann __syncthreads(); 578241a4b83SYohann } 579241a4b83SYohann } 580241a4b83SYohann 581ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 582ab213215SJeremy L Thompson // 3D transpose tensor contract add x 583ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 584241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 585ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 586ac421f39SYohann CeedScalar r_B[Q1d]; 587ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 588ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 589ab213215SJeremy L Thompson 590ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 59118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 592241a4b83SYohann __syncthreads(); 59318d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 594ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 59518d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 596241a4b83SYohann __syncthreads(); 597241a4b83SYohann } 598241a4b83SYohann } 599241a4b83SYohann 600ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 601ab213215SJeremy L Thompson // 3D interpolate to quadrature points 602ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 603241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 604ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 60518d499f1SYohann CeedScalar r_t1[T1d]; 60618d499f1SYohann CeedScalar r_t2[T1d]; 607ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 608241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 609241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 610241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 611241a4b83SYohann } 612241a4b83SYohann } 613241a4b83SYohann 614ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 615ab213215SJeremy L Thompson // 3D interpolate transpose 616ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 617241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 618ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 61918d499f1SYohann CeedScalar r_t1[T1d]; 62018d499f1SYohann CeedScalar r_t2[T1d]; 621ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 622241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 623241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 624241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 625241a4b83SYohann } 626241a4b83SYohann } 627241a4b83SYohann 628ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 629ab213215SJeremy L Thompson // 3D derivatives at quadrature points 630ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 631241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 632ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 63318d499f1SYohann CeedScalar r_t1[T1d]; 63418d499f1SYohann CeedScalar r_t2[T1d]; 635ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 636241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 637241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 638241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 639241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 640241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 641241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 642241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 643241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 644241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 645241a4b83SYohann } 646241a4b83SYohann } 647241a4b83SYohann 648ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 649ab213215SJeremy L Thompson // 3D derivatives transpose 650ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 651241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 652ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 65318d499f1SYohann CeedScalar r_t1[T1d]; 65418d499f1SYohann CeedScalar r_t2[T1d]; 655ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 656241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 657241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 658241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 659241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 660241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 661241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 662241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 663241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 664241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 665241a4b83SYohann } 666241a4b83SYohann } 667241a4b83SYohann 668ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 669ab213215SJeremy L Thompson // 3D collocated derivatives computation 670ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 671ac421f39SYohann template <int NCOMP, int Q1d> 672ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 67318d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 674ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 67518d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 676ac421f39SYohann __syncthreads(); 677ac421f39SYohann // X derivative 678ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 679ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 68018d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 681ac421f39SYohann // Y derivative 682ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 683ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 68418d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 685ac421f39SYohann // Z derivative 686ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 687ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 688ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 689ac421f39SYohann __syncthreads(); 690ac421f39SYohann } 691ac421f39SYohann } 69218d499f1SYohann } 693ac421f39SYohann 694ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 695ab213215SJeremy L Thompson // 3D collocated derivatives transpose 696ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 697ac421f39SYohann template <int NCOMP, int Q1d> 698ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 69918d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 700ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 701ac421f39SYohann // X derivative 70218d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 703ac421f39SYohann __syncthreads(); 704ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70518d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 706ac421f39SYohann __syncthreads(); 707ac421f39SYohann // Y derivative 70818d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 709ac421f39SYohann __syncthreads(); 710ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 71118d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 712ac421f39SYohann __syncthreads(); 713ac421f39SYohann // Z derivative 714ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 715ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 716ac421f39SYohann } 717ac421f39SYohann } 71818d499f1SYohann } 719ac421f39SYohann 720ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 721ab213215SJeremy L Thompson // 1D quadrature weights 722ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 723241a4b83SYohann template <int Q1d> 724053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 72518d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 726241a4b83SYohann } 727241a4b83SYohann 728ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 729ab213215SJeremy L Thompson // 2D quadrature weights 730ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 731241a4b83SYohann template <int Q1d> 732053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 73318d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 73418d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 735241a4b83SYohann } 736241a4b83SYohann 737ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 738ab213215SJeremy L Thompson // 3D quadrature weights 739ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 740241a4b83SYohann template <int Q1d> 741053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 74218d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 74318d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 744ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 74518d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 746241a4b83SYohann } 747241a4b83SYohann 748241a4b83SYohann ); 749ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 750ab213215SJeremy L Thompson // Build singe operator kernel 751ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 752241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 753241a4b83SYohann 754241a4b83SYohann using std::ostringstream; 755241a4b83SYohann using std::string; 756241a4b83SYohann int ierr; 757241a4b83SYohann bool setupdone; 758e15f9bd0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 759e15f9bd0SJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 760241a4b83SYohann Ceed ceed; 761e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 762241a4b83SYohann CeedOperator_Cuda_gen *data; 763e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 764241a4b83SYohann CeedQFunction qf; 765241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 766e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 767e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 76888db6d3bSJeremy L Thompson CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 7695c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 770e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 771e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 772241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 7737e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 774e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 775241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 7767e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 777e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 778241a4b83SYohann CeedEvalMode emode; 779241a4b83SYohann CeedBasis basis; 780241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 781241a4b83SYohann CeedElemRestriction Erestrict; 782461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 783241a4b83SYohann 7840b454692Sjeremylt // Check for restriction only identity operator 7850b454692Sjeremylt bool is_identity_qf; 7860b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 7870b454692Sjeremylt if (is_identity_qf) { 7880b454692Sjeremylt CeedEvalMode emodein, emodeout; 7890b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 7900b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 7910b454692Sjeremylt if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 7920b454692Sjeremylt // LCOV_EXCL_START 7930b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 7940b454692Sjeremylt "Backend does not implement restriction only identity operators"); 7950b454692Sjeremylt // LCOV_EXCL_STOP 7960b454692Sjeremylt } 7970b454692Sjeremylt 798241a4b83SYohann ostringstream code; 799241a4b83SYohann string devFunctions(deviceFunctions); 800241a4b83SYohann 801f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 802f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 803f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 80488db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 80588db6d3bSJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr); 80680a9ef05SNatalie Beams if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 807f1a13f77SYohann Dudouit code << atomicAdd; 808f1a13f77SYohann Dudouit } 809f1a13f77SYohann Dudouit 810241a4b83SYohann code << devFunctions; 811241a4b83SYohann 812241a4b83SYohann string qFunction(qf_data->qFunctionSource); 813c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 814c3c443faSYohann Dudouit string oper; 81514cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 8164d537eeaSYohann 8174d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 81877841947SLeila Ghaffari code << "#define CEED_QFUNCTION_HELPER inline __device__\n"; 819e15f9bd0SJeremy L Thompson code << "#define CeedPragmaSIMD\n"; 820e15f9bd0SJeremy L Thompson code << "#define CEED_ERROR_SUCCESS 0\n\n"; 8211da99368SJeremy L Thompson 8221da99368SJeremy L Thompson // Find dim and Q1d 82364d3f0c0Sjeremylt bool useCollograd = true; 82418d499f1SYohann data->maxP1d = 0; 8251da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 826e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 8271da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 828e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8290f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 830e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8311da99368SJeremy L Thompson 8321da99368SJeremy L Thompson // Check for collocated gradient 83318d499f1SYohann useCollograd = useCollograd && basis_data->d_collograd1d; 8341da99368SJeremy L Thompson 8351da99368SJeremy L Thompson // Collect dim and Q1d 836e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8371da99368SJeremy L Thompson bool isTensor; 838e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8391da99368SJeremy L Thompson if (isTensor) { 840e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 841e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 84218d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8431da99368SJeremy L Thompson } else { 844e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 845e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 846e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8471da99368SJeremy L Thompson } 8481da99368SJeremy L Thompson } 8491da99368SJeremy L Thompson } 8501da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8511da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8521da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 853e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 8540f54b25eSjeremylt 8551da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 856e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8570f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 858e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8590f54b25eSjeremylt 8601da99368SJeremy L Thompson // Collect dim and Q1d 861e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8621da99368SJeremy L Thompson bool isTensor; 863e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8641da99368SJeremy L Thompson if (isTensor) { 865e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 8661da99368SJeremy L Thompson } else { 867e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 868e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 869e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8701da99368SJeremy L Thompson } 8710f54b25eSjeremylt 8720f54b25eSjeremylt // Check for collocated gradient 87364d3f0c0Sjeremylt useCollograd = useCollograd && basis_data->d_collograd1d; 8741da99368SJeremy L Thompson } 8751da99368SJeremy L Thompson } 8761da99368SJeremy L Thompson data->dim = dim; 8771da99368SJeremy L Thompson data->Q1d = Q1d; 8781da99368SJeremy L Thompson 8791da99368SJeremy L Thompson // Define CEED_Q_VLA 88064d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8811da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8821da99368SJeremy L Thompson } else { 8831da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8841da99368SJeremy L Thompson } 8851da99368SJeremy L Thompson 886241a4b83SYohann code << qFunction; 887241a4b83SYohann 888241a4b83SYohann // Setup 889818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 890c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 891241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 892241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 893e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8941da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 895241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 896241a4b83SYohann } 897241a4b83SYohann } 898241a4b83SYohann 899241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 900241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 901241a4b83SYohann } 9021da99368SJeremy L Thompson 903241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 904241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 9051da99368SJeremy L Thompson 906241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 907241a4b83SYohann code << " BackendData data;\n"; 908241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 909241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 910241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 911241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 91218d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 913920dcdc4Sjeremylt 914818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 915ac421f39SYohann //Initialize constants, and matrices B and G 916241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 917818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 918241a4b83SYohann // Get elemsize, emode, ncomp 919241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 920e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 921241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 922e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 923241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 924e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9254d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 926e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 927920dcdc4Sjeremylt 928920dcdc4Sjeremylt // Set field constants 929920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 930e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 931920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 932e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 933920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 934920dcdc4Sjeremylt } else { 935920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 936920dcdc4Sjeremylt } 937920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 938920dcdc4Sjeremylt } 939920dcdc4Sjeremylt 940920dcdc4Sjeremylt // Load basis data 941920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 942241a4b83SYohann switch (emode) { 943241a4b83SYohann case CEED_EVAL_NONE: 944241a4b83SYohann break; 945241a4b83SYohann case CEED_EVAL_INTERP: 946e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 947241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 94880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 949241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 950241a4b83SYohann break; 951241a4b83SYohann case CEED_EVAL_GRAD: 952e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 953241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 95480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 955241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 95664d3f0c0Sjeremylt if (useCollograd) { 957ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 95880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 959ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 960ac421f39SYohann } else { 961ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 96280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 963241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 964ac421f39SYohann } 965241a4b83SYohann break; 966241a4b83SYohann case CEED_EVAL_WEIGHT: 967241a4b83SYohann break; // No action 968241a4b83SYohann case CEED_EVAL_DIV: 969241a4b83SYohann break; // TODO: Not implemented 970241a4b83SYohann case CEED_EVAL_CURL: 971241a4b83SYohann break; // TODO: Not implemented 972241a4b83SYohann } 973241a4b83SYohann } 974920dcdc4Sjeremylt 975818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 976241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 977818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 978241a4b83SYohann // Get elemsize, emode, ncomp 979241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 980e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 981241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 982e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 983241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 984e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9854d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 986e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 987920dcdc4Sjeremylt 988920dcdc4Sjeremylt // Set field constants 989e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 990920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 991e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 992241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 993920dcdc4Sjeremylt } else { 994920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 995920dcdc4Sjeremylt } 996920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 997920dcdc4Sjeremylt 998920dcdc4Sjeremylt // Load basis data 999920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1000920dcdc4Sjeremylt switch (emode) { 1001920dcdc4Sjeremylt case CEED_EVAL_NONE: 1002920dcdc4Sjeremylt break; // No action 1003920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1004e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1005241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 100680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1007241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1008241a4b83SYohann break; 1009241a4b83SYohann case CEED_EVAL_GRAD: 1010e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1011241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 101280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1013241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 101464d3f0c0Sjeremylt if (useCollograd) { 1015ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 101680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1017ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1018ac421f39SYohann } else { 1019ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 102080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1021241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1022ac421f39SYohann } 1023241a4b83SYohann break; 1024e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1025241a4b83SYohann case CEED_EVAL_WEIGHT: { 1026241a4b83SYohann Ceed ceed; 1027e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1028e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1029241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1030241a4b83SYohann break; // Should not occur 1031241a4b83SYohann } 1032241a4b83SYohann case CEED_EVAL_DIV: 1033241a4b83SYohann break; // TODO: Not implemented 1034241a4b83SYohann case CEED_EVAL_CURL: 1035241a4b83SYohann break; // TODO: Not implemented 1036e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1037241a4b83SYohann } 1038241a4b83SYohann } 1039818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1040ac421f39SYohann code << " __syncthreads();\n"; 1041241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1042241a4b83SYohann // Input basis apply if needed 1043ac421f39SYohann // Generate the correct eval mode code for each input 1044818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1045241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1046818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1047241a4b83SYohann // Get elemsize, emode, ncomp 1048241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1049e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1050241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1051e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1052241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1053e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10544d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1055e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1056920dcdc4Sjeremylt 1057920dcdc4Sjeremylt // Restriction 1058920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 105964d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1060241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10619a2291e3SJeremy L Thompson 10629a2291e3SJeremy L Thompson bool isStrided; 1063e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 10649a2291e3SJeremy L Thompson if (!isStrided) { 10655c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1066e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10675c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10685c7b696cSjeremylt CeedInt compstride; 1069e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 10705c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1071e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 10729a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10735c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1074ccedf6b0Sjeremylt } else { 1075920dcdc4Sjeremylt bool backendstrides; 1076fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1077e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 107813585805SJeremy L Thompson CeedInt nelem; 107913585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1080e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108113585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1082920dcdc4Sjeremylt if (!backendstrides) { 1083920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1084e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1085ccedf6b0Sjeremylt } 1086920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1087d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1088920dcdc4Sjeremylt } 1089920dcdc4Sjeremylt } 1090920dcdc4Sjeremylt 1091920dcdc4Sjeremylt // Basis action 1092920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1093920dcdc4Sjeremylt switch (emode) { 1094920dcdc4Sjeremylt case CEED_EVAL_NONE: 109564d3f0c0Sjeremylt if (!useCollograd) { 1096920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1097920dcdc4Sjeremylt } 1098920dcdc4Sjeremylt break; 1099920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1100241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1101241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1102241a4b83SYohann break; 1103241a4b83SYohann case CEED_EVAL_GRAD: 110464d3f0c0Sjeremylt if (useCollograd) { 1105ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1106ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1107ac421f39SYohann } else { 1108241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1109241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1110ac421f39SYohann } 1111241a4b83SYohann break; 1112241a4b83SYohann case CEED_EVAL_WEIGHT: 1113241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1114e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1115e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1116241a4b83SYohann data->W = basis_data->d_qweight1d; 1117241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1118241a4b83SYohann break; // No action 1119241a4b83SYohann case CEED_EVAL_DIV: 1120241a4b83SYohann break; // TODO: Not implemented 1121241a4b83SYohann case CEED_EVAL_CURL: 1122241a4b83SYohann break; // TODO: Not implemented 1123241a4b83SYohann } 1124241a4b83SYohann } 1125ac421f39SYohann 1126241a4b83SYohann // Q function 1127818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1128241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1129818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1130241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1131e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1132241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1133241a4b83SYohann { 113464d3f0c0Sjeremylt if (useCollograd) { 1135ac421f39SYohann //Accumulator for gradient slices 1136ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1137ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1138ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1139ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1140ac421f39SYohann code << " }\n"; 1141ac421f39SYohann code << " }\n"; 1142ac421f39SYohann } else { 1143241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1144241a4b83SYohann } 1145ac421f39SYohann } 1146241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1147241a4b83SYohann { 1148241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1149241a4b83SYohann } 1150241a4b83SYohann } 1151ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 115264d3f0c0Sjeremylt if (useCollograd) { 1153920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1154ac421f39SYohann code << "#pragma unroll\n"; 1155ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1156818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1157ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1158818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1159ac421f39SYohann // Get elemsize, emode, ncomp 1160ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1161e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1162ac421f39SYohann // Basis action 1163920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1164ac421f39SYohann switch (emode) { 1165ac421f39SYohann case CEED_EVAL_NONE: 1166ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11679a2291e3SJeremy L Thompson 11689a2291e3SJeremy L Thompson bool isStrided; 1169e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1170e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1171e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 11729a2291e3SJeremy L Thompson if (!isStrided) { 11735c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1174e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11755c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11765c7b696cSjeremylt CeedInt compstride; 1177e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 11785c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1179e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 11809a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11815c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1182920dcdc4Sjeremylt } else { 1183920dcdc4Sjeremylt bool backendstrides; 1184fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1185e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 118613585805SJeremy L Thompson CeedInt nelem; 118713585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1188e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 118913585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1190920dcdc4Sjeremylt if (!backendstrides) { 1191920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1192e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1193920dcdc4Sjeremylt } 1194920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1195d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1196920dcdc4Sjeremylt } 1197ac421f39SYohann break; 1198ac421f39SYohann case CEED_EVAL_INTERP: 1199ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1200ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1201ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1202ac421f39SYohann code << " }\n"; 1203ac421f39SYohann break; 1204ac421f39SYohann case CEED_EVAL_GRAD: 1205ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1206ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1207ac421f39SYohann break; 1208ac421f39SYohann case CEED_EVAL_WEIGHT: 1209ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1210ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1211ac421f39SYohann break; // No action 1212ac421f39SYohann case CEED_EVAL_DIV: 1213ac421f39SYohann break; // TODO: Not implemented 1214ac421f39SYohann case CEED_EVAL_CURL: 1215ac421f39SYohann break; // TODO: Not implemented 1216ac421f39SYohann } 1217ac421f39SYohann } 1218818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1219ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1220818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1221ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1222e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1223ac421f39SYohann // Basis action 1224ac421f39SYohann switch (emode) { 1225ac421f39SYohann case CEED_EVAL_NONE: 1226ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1227ac421f39SYohann break; // No action 1228ac421f39SYohann case CEED_EVAL_INTERP: 1229ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1230ac421f39SYohann break; 1231ac421f39SYohann case CEED_EVAL_GRAD: 1232ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1233ac421f39SYohann break; 1234ac421f39SYohann case CEED_EVAL_WEIGHT: 1235ac421f39SYohann break; // Should not occur 1236ac421f39SYohann case CEED_EVAL_DIV: 1237ac421f39SYohann break; // TODO: Not implemented 1238ac421f39SYohann case CEED_EVAL_CURL: 1239ac421f39SYohann break; // TODO: Not implemented 1240ac421f39SYohann } 1241ac421f39SYohann } 1242ac421f39SYohann } else { 1243920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1244818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1245ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1246818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1247ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1248ac421f39SYohann } 1249818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1250ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1251818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1252ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1253ac421f39SYohann } 1254ac421f39SYohann } 1255818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12564d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12574d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1258818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1259ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12604d537eeaSYohann } 12614d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12624d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1263818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1264ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12654d537eeaSYohann } 1266818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1267ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 126864d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1269ac421f39SYohann code << "1"; 1270ac421f39SYohann } else { 1271ac421f39SYohann code << "Q1d"; 1272ac421f39SYohann } 1273ac421f39SYohann code << ", in, out);\n"; 127464d3f0c0Sjeremylt if (useCollograd) { 1275920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1276818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1277ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1278818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1279ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1280e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1281ac421f39SYohann // Basis action 1282920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1283ac421f39SYohann switch (emode) { 1284ac421f39SYohann case CEED_EVAL_NONE: 1285ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1286ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1287ac421f39SYohann code << " }\n"; 1288ac421f39SYohann break; // No action 1289ac421f39SYohann case CEED_EVAL_INTERP: 1290ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1291ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1292ac421f39SYohann code << " }\n"; 1293ac421f39SYohann break; 1294ac421f39SYohann case CEED_EVAL_GRAD: 1295ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1296ac421f39SYohann break; 1297ac421f39SYohann case CEED_EVAL_WEIGHT: 1298ac421f39SYohann break; // Should not occur 1299ac421f39SYohann case CEED_EVAL_DIV: 1300ac421f39SYohann break; // TODO: Not implemented 1301ac421f39SYohann case CEED_EVAL_CURL: 1302ac421f39SYohann break; // TODO: Not implemented 1303ac421f39SYohann } 1304ac421f39SYohann } 1305ac421f39SYohann code << " }\n"; 1306ac421f39SYohann } 1307241a4b83SYohann 1308241a4b83SYohann // Output basis apply if needed 1309ac421f39SYohann // Generate the correct eval mode code for each output 1310818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1311241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1312818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1313241a4b83SYohann // Get elemsize, emode, ncomp 1314241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1315e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1316241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1317e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1318241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1319e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13204d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1321e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1322241a4b83SYohann // Basis action 1323920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1324241a4b83SYohann switch (emode) { 1325241a4b83SYohann case CEED_EVAL_NONE: 1326920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1327241a4b83SYohann break; // No action 1328241a4b83SYohann case CEED_EVAL_INTERP: 1329241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1330241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1331241a4b83SYohann break; 1332241a4b83SYohann case CEED_EVAL_GRAD: 1333241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 133464d3f0c0Sjeremylt if (useCollograd) { 1335ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1336ac421f39SYohann } else { 1337241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1338ac421f39SYohann } 1339241a4b83SYohann break; 1340e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1341241a4b83SYohann case CEED_EVAL_WEIGHT: { 1342241a4b83SYohann Ceed ceed; 1343e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1344e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1345241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1346241a4b83SYohann break; // Should not occur 1347241a4b83SYohann } 1348241a4b83SYohann case CEED_EVAL_DIV: 1349241a4b83SYohann break; // TODO: Not implemented 1350241a4b83SYohann case CEED_EVAL_CURL: 1351241a4b83SYohann break; // TODO: Not implemented 1352e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1353241a4b83SYohann } 1354920dcdc4Sjeremylt // Restriction 13559a2291e3SJeremy L Thompson bool isStrided; 1356e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 13579a2291e3SJeremy L Thompson if (!isStrided) { 13585c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1359e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13605c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13615c7b696cSjeremylt CeedInt compstride; 1362e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 13635c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1364e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 13659a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13665c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1367920dcdc4Sjeremylt } else { 1368920dcdc4Sjeremylt bool backendstrides; 1369fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1370e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137113585805SJeremy L Thompson CeedInt nelem; 137213585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1373e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137413585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1375920dcdc4Sjeremylt if (!backendstrides) { 1376920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1377e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1378920dcdc4Sjeremylt } 1379920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1380d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1381920dcdc4Sjeremylt } 1382241a4b83SYohann } 1383241a4b83SYohann 1384241a4b83SYohann code << " }\n"; 1385818e0025SJeremy L Thompson code << "}\n"; 1386818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1387241a4b83SYohann 1388ab213215SJeremy L Thompson // View kernel for debugging 13893f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 1390241a4b83SYohann 139118d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 139218d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1393e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1394c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1395e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1396241a4b83SYohann 1397e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1398e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1399241a4b83SYohann } 1400ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1401