1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 177df94212SJeremy L Thompson 18241a4b83SYohann #include "ceed-cuda-gen.h" 19241a4b83SYohann #include <iostream> 20241a4b83SYohann #include <sstream> 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 26ab213215SJeremy L Thompson // Atomic add, for older CUDA 27ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 28241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 29241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 30241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 31241a4b83SYohann do { 32241a4b83SYohann assumed = old; 33241a4b83SYohann old = 34241a4b83SYohann atomicCAS(address_as_ull, assumed, 35241a4b83SYohann __double_as_longlong(val + 36241a4b83SYohann __longlong_as_double(assumed))); 37241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 38241a4b83SYohann // (since NaN != NaN) 39241a4b83SYohann } while (assumed != old); 40241a4b83SYohann return __longlong_as_double(old); 41241a4b83SYohann } 42f1a13f77SYohann Dudouit ); 43f1a13f77SYohann Dudouit 44f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 45f1a13f77SYohann Dudouit 46ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 47ab213215SJeremy L Thompson // Typedefs 48ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 49f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 50f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 51f1a13f77SYohann Dudouit 52f1a13f77SYohann Dudouit typedef struct { 53f1a13f77SYohann Dudouit CeedInt tidx; 54f1a13f77SYohann Dudouit CeedInt tidy; 55f1a13f77SYohann Dudouit CeedInt tidz; 56f1a13f77SYohann Dudouit CeedInt tid; 57f1a13f77SYohann Dudouit CeedScalar* slice; 58f1a13f77SYohann Dudouit } BackendData; 59241a4b83SYohann 60ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 61ab213215SJeremy L Thompson // Load matrices for basis actions 62ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 63241a4b83SYohann template <int P, int Q> 64241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 65ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 66241a4b83SYohann B[i] = d_B[i]; 67241a4b83SYohann } 68241a4b83SYohann 69ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 70241a4b83SYohann // 1D 71ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 72ab213215SJeremy L Thompson 73ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 74ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 75ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 775c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78ab213215SJeremy L Thompson if (data.tidx < P1d) { 794d537eeaSYohann const CeedInt node = data.tidx; 80ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 81ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 825c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85920dcdc4Sjeremylt 86ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 87ab213215SJeremy L Thompson // L-vector -> E-vector, strided 88ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 89d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 90d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 91ab213215SJeremy L Thompson if (data.tidx < P1d) { 92ccedf6b0Sjeremylt const CeedInt node = data.tidx; 93d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 94ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 95d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 96ccedf6b0Sjeremylt } 97ccedf6b0Sjeremylt } 98241a4b83SYohann 99ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 100ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 101ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1025c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1035c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 104ab213215SJeremy L Thompson if (data.tidx < P1d) { 1054d537eeaSYohann const CeedInt node = data.tidx; 106ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 107ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1085c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 109241a4b83SYohann } 110241a4b83SYohann } 111241a4b83SYohann 112ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 113ab213215SJeremy L Thompson // E-vector -> L-vector, strided 114ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 115d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 116d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 117ab213215SJeremy L Thompson if (data.tidx < P1d) { 118ccedf6b0Sjeremylt const CeedInt node = data.tidx; 119d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 120ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 121d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 122ccedf6b0Sjeremylt } 123ccedf6b0Sjeremylt } 124ccedf6b0Sjeremylt 125ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 126ab213215SJeremy L Thompson // 1D tensor contraction x 127ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 128241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 129ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130241a4b83SYohann data.slice[data.tidx] = *U; 131241a4b83SYohann __syncthreads(); 132241a4b83SYohann *V = 0.0; 133ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 134ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 135241a4b83SYohann __syncthreads(); 136241a4b83SYohann } 137241a4b83SYohann 138ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 139ab213215SJeremy L Thompson // 1D transpose tensor contraction x 140ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 141241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 142ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 143241a4b83SYohann data.slice[data.tidx] = *U; 144241a4b83SYohann __syncthreads(); 145241a4b83SYohann *V = 0.0; 146ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 147ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 148241a4b83SYohann __syncthreads(); 149241a4b83SYohann } 150241a4b83SYohann 151ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 152ab213215SJeremy L Thompson // 1D interpolate to quadrature points 153ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 154241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 155ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 156ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 157241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 158241a4b83SYohann } 159241a4b83SYohann 160ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 161ab213215SJeremy L Thompson // 1D interpolate transpose 162ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 163241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 164ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 165ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 166241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 167241a4b83SYohann } 168241a4b83SYohann 169ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 170ab213215SJeremy L Thompson // 1D derivatives at quadrature points 171ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 172241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 173ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 174ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 175241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 176241a4b83SYohann } 177241a4b83SYohann 178ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 179ab213215SJeremy L Thompson // 1D derivatives transpose 180ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 181241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 182ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 183ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 184241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 185241a4b83SYohann } 186241a4b83SYohann 187ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 188241a4b83SYohann // 2D 189ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 190ab213215SJeremy L Thompson 191ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 192ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 193ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1945c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1955c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 196ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 1974d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 198ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 199ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2005c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 201241a4b83SYohann } 202241a4b83SYohann } 203920dcdc4Sjeremylt 204ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 205ab213215SJeremy L Thompson // L-vector -> E-vector, strided 206ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 207d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 208d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 209ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 210ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 211d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 212ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 213d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 214ccedf6b0Sjeremylt } 215ccedf6b0Sjeremylt } 216241a4b83SYohann 217ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 218ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 219ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2205c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 2215c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 222ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2234d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 224ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 225ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2265c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 227241a4b83SYohann } 228241a4b83SYohann } 229241a4b83SYohann 230ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 231ab213215SJeremy L Thompson // E-vector -> L-vector, strided 232ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 233d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 234d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 235ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 236ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 237d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 238ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 239d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 240ccedf6b0Sjeremylt } 241ccedf6b0Sjeremylt } 242ccedf6b0Sjeremylt 243ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 244ab213215SJeremy L Thompson // 2D tensor contraction x 245ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 246241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 247ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 248241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 249241a4b83SYohann __syncthreads(); 250241a4b83SYohann *V = 0.0; 251ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 252ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 253241a4b83SYohann __syncthreads(); 254241a4b83SYohann } 255241a4b83SYohann 256ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 257ab213215SJeremy L Thompson // 2D tensor contract y 258ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 259241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 260ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 261241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 262241a4b83SYohann __syncthreads(); 263241a4b83SYohann *V = 0.0; 264ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 265ab213215SJeremy L Thompson *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction 266241a4b83SYohann __syncthreads(); 267241a4b83SYohann } 268241a4b83SYohann 269ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 270ab213215SJeremy L Thompson // 2D transpose tensor contract y 271ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 272241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 273ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 274241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 275241a4b83SYohann __syncthreads(); 276241a4b83SYohann *V = 0.0; 277ab213215SJeremy L Thompson if (data.tidy < P1d) 278ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 279ab213215SJeremy L Thompson *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction 280241a4b83SYohann __syncthreads(); 281241a4b83SYohann } 282241a4b83SYohann 283ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 284ab213215SJeremy L Thompson // 2D transpose tensor contract x 285ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 286241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 287ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 288241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 289241a4b83SYohann __syncthreads(); 290241a4b83SYohann *V = 0.0; 291ab213215SJeremy L Thompson if (data.tidx < P1d) 292ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 293ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 294241a4b83SYohann __syncthreads(); 295241a4b83SYohann } 296241a4b83SYohann 297ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 298ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 299ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 300241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 301ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 302241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 303241a4b83SYohann __syncthreads(); 304ab213215SJeremy L Thompson if (data.tidx < P1d) 305ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 306ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 307241a4b83SYohann __syncthreads(); 308241a4b83SYohann } 309241a4b83SYohann 310ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 311ab213215SJeremy L Thompson // 2D interpolate to quadrature points 312ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 313241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 314ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 315241a4b83SYohann CeedScalar r_t[1]; 316ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 317241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 318241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 319241a4b83SYohann } 320241a4b83SYohann } 321241a4b83SYohann 322ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 323ab213215SJeremy L Thompson // 2D interpolate transpose 324ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 325241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 326ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 327241a4b83SYohann CeedScalar r_t[1]; 328ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 329241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 330241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 331241a4b83SYohann } 332241a4b83SYohann } 333241a4b83SYohann 334ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 335ab213215SJeremy L Thompson // 2D derivatives at quadrature points 336ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 337241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 338ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 339241a4b83SYohann CeedScalar r_t[1]; 340ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 341241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 342241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 343241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 344241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 345241a4b83SYohann } 346241a4b83SYohann } 347241a4b83SYohann 348ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 349ab213215SJeremy L Thompson // 2D derivatives transpose 350ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 351241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 352ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 353241a4b83SYohann CeedScalar r_t[1]; 354ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 355241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 356241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 357241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 358241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 359241a4b83SYohann } 360241a4b83SYohann } 361241a4b83SYohann 362ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 363241a4b83SYohann // 3D 364ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 365ab213215SJeremy L Thompson 366ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 367ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 368ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3695c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 3705c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 371ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 372241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3734d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 374ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 375ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3765c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 377241a4b83SYohann } 378241a4b83SYohann } 379920dcdc4Sjeremylt 380ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 381ab213215SJeremy L Thompson // L-vector -> E-vector, strided 382ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 383d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 384d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 385ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 386ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 387ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 388d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 389ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 390d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 391ccedf6b0Sjeremylt } 392ccedf6b0Sjeremylt } 393241a4b83SYohann 394ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 395ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 396ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3975c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 3985c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 399ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 400920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 401ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4025c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 403ac421f39SYohann } 404ac421f39SYohann 405ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 406ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 407ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 408d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 40925dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 410920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 411d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 412ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 413d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 414920dcdc4Sjeremylt } 415920dcdc4Sjeremylt 416ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 417ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 418ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4195c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 4205c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 421ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 422241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4234d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 424ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 425ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4265c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 427241a4b83SYohann } 428241a4b83SYohann } 429241a4b83SYohann 430ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 431ab213215SJeremy L Thompson // E-vector -> L-vector, strided 432ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 433d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4342f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 435ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 436ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 437ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 438d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 439ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 440d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 441ccedf6b0Sjeremylt } 442ccedf6b0Sjeremylt } 443ccedf6b0Sjeremylt 444ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 445ab213215SJeremy L Thompson // 3D tensor contract x 446ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 447241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 448ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 449ac421f39SYohann CeedScalar r_B[P1d]; 450ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 451ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 452ab213215SJeremy L Thompson 453ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 454241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 455241a4b83SYohann __syncthreads(); 456241a4b83SYohann V[k] = 0.0; 457ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 458ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 459241a4b83SYohann __syncthreads(); 460241a4b83SYohann } 461241a4b83SYohann } 462241a4b83SYohann 463ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 464ab213215SJeremy L Thompson // 3D tensor contract y 465ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 466241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 467ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 468ac421f39SYohann CeedScalar r_B[P1d]; 469ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 470ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 471ab213215SJeremy L Thompson 472ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 473241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 474241a4b83SYohann __syncthreads(); 475241a4b83SYohann V[k] = 0.0; 476ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 477ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 478241a4b83SYohann __syncthreads(); 479241a4b83SYohann } 480241a4b83SYohann } 481241a4b83SYohann 482ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 483ab213215SJeremy L Thompson // 3D tensor contract z 484ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 485241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 486ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 487ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 488241a4b83SYohann V[k] = 0.0; 489ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 490ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 491241a4b83SYohann } 492241a4b83SYohann } 493241a4b83SYohann 494ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 495ab213215SJeremy L Thompson // 3D transpose tensor contract z 496ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 497241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 498ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 499ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 500241a4b83SYohann V[k] = 0.0; 501ab213215SJeremy L Thompson if (k < P1d) 502ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 503ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 504241a4b83SYohann } 505241a4b83SYohann } 506241a4b83SYohann 507ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 508ab213215SJeremy L Thompson // 3D transpose tensor contract y 509ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 510241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 511ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 512ac421f39SYohann CeedScalar r_B[Q1d]; 513ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 514ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 515ab213215SJeremy L Thompson 516ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 517241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 518241a4b83SYohann __syncthreads(); 519241a4b83SYohann V[k] = 0.0; 520ab213215SJeremy L Thompson if (data.tidy < P1d) 521ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 522ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 523ac421f39SYohann __syncthreads(); 524ac421f39SYohann } 525ac421f39SYohann } 526ac421f39SYohann 527ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 528ab213215SJeremy L Thompson // 3D transpose tensor contract add y 529ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 530ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 531ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 532ac421f39SYohann CeedScalar r_B[Q1d]; 533ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 534ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 535ab213215SJeremy L Thompson 536ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 537ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 538ac421f39SYohann __syncthreads(); 539ab213215SJeremy L Thompson if (data.tidy < P1d) 540ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 541ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 542241a4b83SYohann __syncthreads(); 543241a4b83SYohann } 544241a4b83SYohann } 545241a4b83SYohann 546ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 547ab213215SJeremy L Thompson // 3D transpose tensor contract x 548ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 549241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 550ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 551ac421f39SYohann CeedScalar r_B[Q1d]; 552ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 553ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 554ab213215SJeremy L Thompson 555ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 556241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 557241a4b83SYohann __syncthreads(); 558241a4b83SYohann V[k] = 0.0; 559ab213215SJeremy L Thompson if (data.tidx < P1d) 560ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 561ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 562241a4b83SYohann __syncthreads(); 563241a4b83SYohann } 564241a4b83SYohann } 565241a4b83SYohann 566ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 567ab213215SJeremy L Thompson // 3D transpose tensor contract add x 568ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 569241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 570ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 571ac421f39SYohann CeedScalar r_B[Q1d]; 572ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 573ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 574ab213215SJeremy L Thompson 575ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 576241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 577241a4b83SYohann __syncthreads(); 578ab213215SJeremy L Thompson if (data.tidx < P1d) 579ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 580ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 581241a4b83SYohann __syncthreads(); 582241a4b83SYohann } 583241a4b83SYohann } 584241a4b83SYohann 585ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 586ab213215SJeremy L Thompson // 3D interpolate to quadrature points 587ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 588241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 589ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 590241a4b83SYohann CeedScalar r_t1[Q1d]; 591241a4b83SYohann CeedScalar r_t2[Q1d]; 592ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 593241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 594241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 595241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 596241a4b83SYohann } 597241a4b83SYohann } 598241a4b83SYohann 599ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 600ab213215SJeremy L Thompson // 3D interpolate transpose 601ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 602241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 603ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 604241a4b83SYohann CeedScalar r_t1[Q1d]; 605241a4b83SYohann CeedScalar r_t2[Q1d]; 606ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 607241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 608241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 609241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 610241a4b83SYohann } 611241a4b83SYohann } 612241a4b83SYohann 613ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 614ab213215SJeremy L Thompson // 3D derivatives at quadrature points 615ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 616241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 617ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 618241a4b83SYohann CeedScalar r_t1[Q1d]; 619241a4b83SYohann CeedScalar r_t2[Q1d]; 620ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 621241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 622241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 623241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 624241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 625241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 626241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 627241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 628241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 629241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 630241a4b83SYohann } 631241a4b83SYohann } 632241a4b83SYohann 633ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 634ab213215SJeremy L Thompson // 3D derivatives transpose 635ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 636241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 637ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 638241a4b83SYohann CeedScalar r_t1[Q1d]; 639241a4b83SYohann CeedScalar r_t2[Q1d]; 640ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 641241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 642241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 643241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 644241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 645241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 646241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 647241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 648241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 649241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 650241a4b83SYohann } 651241a4b83SYohann } 652241a4b83SYohann 653ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 654ab213215SJeremy L Thompson // 3D collocated derivatives computation 655ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 656ac421f39SYohann template <int NCOMP, int Q1d> 657ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 658ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 659ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d]; 660ac421f39SYohann __syncthreads(); 661ac421f39SYohann // X derivative 662ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 663ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 664ab213215SJeremy L Thompson r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative) 665ac421f39SYohann // Y derivative 666ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 667ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 668ab213215SJeremy L Thompson r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative) 669ac421f39SYohann // Z derivative 670ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 671ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 672ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 673ac421f39SYohann __syncthreads(); 674ac421f39SYohann } 675ac421f39SYohann } 676ac421f39SYohann 677ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 678ab213215SJeremy L Thompson // 3D collocated derivatives transpose 679ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 680ac421f39SYohann template <int NCOMP, int Q1d> 681ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 682ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 683ac421f39SYohann // X derivative 684ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 685ac421f39SYohann __syncthreads(); 686ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 687ab213215SJeremy L Thompson r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative) 688ac421f39SYohann __syncthreads(); 689ac421f39SYohann // Y derivative 690ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 691ac421f39SYohann __syncthreads(); 692ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 693ab213215SJeremy L Thompson r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative) 694ac421f39SYohann __syncthreads(); 695ac421f39SYohann // Z derivative 696ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 697ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 698ac421f39SYohann } 699ac421f39SYohann } 700ac421f39SYohann 701ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 702ab213215SJeremy L Thompson // 1D quadrature weights 703ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 704241a4b83SYohann template <int Q1d> 705241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 706241a4b83SYohann *w = qweight1d[data.tidx]; 707241a4b83SYohann } 708241a4b83SYohann 709ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 710ab213215SJeremy L Thompson // 2D quadrature weights 711ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 712241a4b83SYohann template <int Q1d> 713241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 714241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 715241a4b83SYohann } 716241a4b83SYohann 717ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 718ab213215SJeremy L Thompson // 3D quadrature weights 719ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 720241a4b83SYohann template <int Q1d> 721241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 722241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 723ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 724241a4b83SYohann w[z] = pw*qweight1d[z]; 725241a4b83SYohann } 726241a4b83SYohann 727241a4b83SYohann ); 728ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 729ab213215SJeremy L Thompson // Build singe operator kernel 730ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 731241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 732241a4b83SYohann 733241a4b83SYohann using std::ostringstream; 734241a4b83SYohann using std::string; 735241a4b83SYohann int ierr; 736241a4b83SYohann bool setupdone; 737fd364f38SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 738241a4b83SYohann if (setupdone) return 0; 739241a4b83SYohann Ceed ceed; 740241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 741241a4b83SYohann CeedOperator_Cuda_gen *data; 7426c845298Sjeremylt ierr = CeedOperatorGetData(op, &data); CeedChk(ierr); 743241a4b83SYohann CeedQFunction qf; 744241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 745241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 7466c845298Sjeremylt ierr = CeedQFunctionGetData(qf, &qf_data); CeedChk(ierr); 7471da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 7485c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 749241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 750241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 751241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 752241a4b83SYohann CeedChk(ierr); 753241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 754241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 755241a4b83SYohann CeedChk(ierr); 756241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 757241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 758241a4b83SYohann CeedChk(ierr); 759241a4b83SYohann CeedEvalMode emode; 760241a4b83SYohann CeedBasis basis; 761241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 762241a4b83SYohann CeedElemRestriction Erestrict; 763241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 764241a4b83SYohann 765241a4b83SYohann ostringstream code; 766241a4b83SYohann string devFunctions(deviceFunctions); 767241a4b83SYohann 768f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 769f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 770f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 7716c845298Sjeremylt ierr = CeedGetData(ceed, &ceed_data); CeedChk(ierr); 772f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 773f1a13f77SYohann Dudouit if (prop.major<6){ 774f1a13f77SYohann Dudouit code << atomicAdd; 775f1a13f77SYohann Dudouit } 776f1a13f77SYohann Dudouit 777241a4b83SYohann code << devFunctions; 778241a4b83SYohann 779241a4b83SYohann string qFunction(qf_data->qFunctionSource); 780c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 781c3c443faSYohann Dudouit string oper; 78214cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 7834d537eeaSYohann 7844d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 785ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 7861da99368SJeremy L Thompson 7871da99368SJeremy L Thompson // Find dim and Q1d 788*0f54b25eSjeremylt bool collograd = true; 7891da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 7901da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 7911da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 7926c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 793*0f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 794*0f54b25eSjeremylt CeedChk(ierr); 7951da99368SJeremy L Thompson 7961da99368SJeremy L Thompson // Check for collocated gradient 797*0f54b25eSjeremylt if (emode == CEED_EVAL_GRAD) 798*0f54b25eSjeremylt collograd = collograd && !!basis_data->d_collograd1d; 7991da99368SJeremy L Thompson 8001da99368SJeremy L Thompson // Collect dim and Q1d 8011da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8021da99368SJeremy L Thompson bool isTensor; 803fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8041da99368SJeremy L Thompson if (isTensor) { 8051da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8061da99368SJeremy L Thompson } else { 807e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8081da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 809e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8101da99368SJeremy L Thompson } 8111da99368SJeremy L Thompson } 8121da99368SJeremy L Thompson } 8131da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8141da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8151da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 8161da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 817*0f54b25eSjeremylt 8181da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8196c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 820*0f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 821*0f54b25eSjeremylt CeedChk(ierr); 822*0f54b25eSjeremylt 8231da99368SJeremy L Thompson // Collect dim and Q1d 8241da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8251da99368SJeremy L Thompson bool isTensor; 826fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8271da99368SJeremy L Thompson if (isTensor) { 8281da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8291da99368SJeremy L Thompson } else { 830e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 8311da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 832e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8331da99368SJeremy L Thompson } 834*0f54b25eSjeremylt 835*0f54b25eSjeremylt // Check for collocated gradient 836*0f54b25eSjeremylt if (emode == CEED_EVAL_GRAD) 837*0f54b25eSjeremylt collograd = collograd && !!basis_data->d_collograd1d; 8381da99368SJeremy L Thompson } 8391da99368SJeremy L Thompson } 8401da99368SJeremy L Thompson data->dim = dim; 8411da99368SJeremy L Thompson data->Q1d = Q1d; 8421da99368SJeremy L Thompson 8431da99368SJeremy L Thompson // Define CEED_Q_VLA 8441da99368SJeremy L Thompson if (dim != 3 || collograd) { 8451da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8461da99368SJeremy L Thompson } else { 8471da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8481da99368SJeremy L Thompson } 8491da99368SJeremy L Thompson 850241a4b83SYohann code << qFunction; 851241a4b83SYohann 852241a4b83SYohann // Setup 853818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 854c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 855241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 856241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 857241a4b83SYohann CeedChk(ierr); 8581da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 859241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 860241a4b83SYohann } 861241a4b83SYohann } 862241a4b83SYohann 863241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 864241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 865241a4b83SYohann } 8661da99368SJeremy L Thompson 867241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 868241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 8691da99368SJeremy L Thompson 870241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 871241a4b83SYohann code << " BackendData data;\n"; 872241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 873241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 874241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 875241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 876241a4b83SYohann code << " data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 877920dcdc4Sjeremylt 878818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 879ac421f39SYohann //Initialize constants, and matrices B and G 880241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 881818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 882241a4b83SYohann // Get elemsize, emode, ncomp 883241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 884241a4b83SYohann CeedChk(ierr); 885241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 886241a4b83SYohann CeedChk(ierr); 887241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 888241a4b83SYohann CeedChk(ierr); 8894d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 890241a4b83SYohann CeedChk(ierr); 891920dcdc4Sjeremylt 892920dcdc4Sjeremylt // Set field constants 893920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 894920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 895920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 896920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 897920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 898920dcdc4Sjeremylt } else { 899920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 900920dcdc4Sjeremylt } 901920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 902920dcdc4Sjeremylt } 903920dcdc4Sjeremylt 904920dcdc4Sjeremylt // Load basis data 905920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 906241a4b83SYohann switch (emode) { 907241a4b83SYohann case CEED_EVAL_NONE: 908241a4b83SYohann break; 909241a4b83SYohann case CEED_EVAL_INTERP: 9106c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 911241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 912241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 913241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 914241a4b83SYohann break; 915241a4b83SYohann case CEED_EVAL_GRAD: 9166c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 917241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 918241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 919241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 920*0f54b25eSjeremylt if (collograd) { 921ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 922ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 923ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 924ac421f39SYohann } else { 925ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 926ac421f39SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 927241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 928ac421f39SYohann } 929241a4b83SYohann break; 930241a4b83SYohann case CEED_EVAL_WEIGHT: 931241a4b83SYohann break; // No action 932241a4b83SYohann case CEED_EVAL_DIV: 933241a4b83SYohann break; // TODO: Not implemented 934241a4b83SYohann case CEED_EVAL_CURL: 935241a4b83SYohann break; // TODO: Not implemented 936241a4b83SYohann } 937241a4b83SYohann } 938920dcdc4Sjeremylt 939818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 940241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 941818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 942241a4b83SYohann // Get elemsize, emode, ncomp 943241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 944241a4b83SYohann CeedChk(ierr); 945241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 946241a4b83SYohann CeedChk(ierr); 947241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 948241a4b83SYohann CeedChk(ierr); 9494d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 950241a4b83SYohann CeedChk(ierr); 951920dcdc4Sjeremylt 952920dcdc4Sjeremylt // Set field constants 953241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 954920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 955241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 956241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 957920dcdc4Sjeremylt } else { 958920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 959920dcdc4Sjeremylt } 960920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 961920dcdc4Sjeremylt 962920dcdc4Sjeremylt // Load basis data 963920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 964920dcdc4Sjeremylt switch (emode) { 965920dcdc4Sjeremylt case CEED_EVAL_NONE: 966920dcdc4Sjeremylt break; // No action 967920dcdc4Sjeremylt case CEED_EVAL_INTERP: 9686c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 969241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 970241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 971241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 972241a4b83SYohann break; 973241a4b83SYohann case CEED_EVAL_GRAD: 9746c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 975241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 976241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 977241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 978*0f54b25eSjeremylt if (collograd) { 979ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 980ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 981ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 982ac421f39SYohann } else { 983ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 984ac421f39SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 985241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 986ac421f39SYohann } 987241a4b83SYohann break; 988e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 989241a4b83SYohann case CEED_EVAL_WEIGHT: { 990241a4b83SYohann Ceed ceed; 991241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 992241a4b83SYohann return CeedError(ceed, 1, 993241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 994241a4b83SYohann break; // Should not occur 995241a4b83SYohann } 996241a4b83SYohann case CEED_EVAL_DIV: 997241a4b83SYohann break; // TODO: Not implemented 998241a4b83SYohann case CEED_EVAL_CURL: 999241a4b83SYohann break; // TODO: Not implemented 1000e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1001241a4b83SYohann } 1002241a4b83SYohann } 1003818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1004ac421f39SYohann code << " __syncthreads();\n"; 1005241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1006241a4b83SYohann // Input basis apply if needed 1007ac421f39SYohann // Generate the correct eval mode code for each input 1008818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1009241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1010818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1011241a4b83SYohann // Get elemsize, emode, ncomp 1012241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1013241a4b83SYohann CeedChk(ierr); 1014241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1015241a4b83SYohann CeedChk(ierr); 1016241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1017241a4b83SYohann CeedChk(ierr); 10184d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1019241a4b83SYohann CeedChk(ierr); 1020920dcdc4Sjeremylt 1021920dcdc4Sjeremylt // Restriction 1022920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 1023*0f54b25eSjeremylt !((emode == CEED_EVAL_NONE) && collograd)) { 1024241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10259a2291e3SJeremy L Thompson 10269a2291e3SJeremy L Thompson bool isStrided; 1027fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 10289a2291e3SJeremy L Thompson if (!isStrided) { 10295c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 10305c7b696cSjeremylt CeedChk(ierr); 10315c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10325c7b696cSjeremylt CeedInt compstride; 10335c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 10345c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 10356c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 10369a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10375c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1038ccedf6b0Sjeremylt } else { 1039920dcdc4Sjeremylt bool backendstrides; 1040fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1041920dcdc4Sjeremylt CeedChk(ierr); 104213585805SJeremy L Thompson CeedInt nelem; 104313585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 104413585805SJeremy L Thompson CeedChk(ierr); 104513585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1046920dcdc4Sjeremylt if (!backendstrides) { 1047920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1048920dcdc4Sjeremylt CeedChk(ierr); 1049ccedf6b0Sjeremylt } 1050920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1051d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1052920dcdc4Sjeremylt } 1053920dcdc4Sjeremylt } 1054920dcdc4Sjeremylt 1055920dcdc4Sjeremylt // Basis action 1056920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1057920dcdc4Sjeremylt switch (emode) { 1058920dcdc4Sjeremylt case CEED_EVAL_NONE: 1059*0f54b25eSjeremylt if (!collograd) { 1060920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1061920dcdc4Sjeremylt } 1062920dcdc4Sjeremylt break; 1063920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1064241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1065241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1066241a4b83SYohann break; 1067241a4b83SYohann case CEED_EVAL_GRAD: 1068*0f54b25eSjeremylt if (collograd) { 1069ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1070ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1071ac421f39SYohann } else { 1072241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1073241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1074ac421f39SYohann } 1075241a4b83SYohann break; 1076241a4b83SYohann case CEED_EVAL_WEIGHT: 1077241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1078241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 10796c845298Sjeremylt ierr = CeedBasisGetData(basis, &basis_data); CeedChk(ierr); 1080241a4b83SYohann data->W = basis_data->d_qweight1d; 1081241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1082241a4b83SYohann break; // No action 1083241a4b83SYohann case CEED_EVAL_DIV: 1084241a4b83SYohann break; // TODO: Not implemented 1085241a4b83SYohann case CEED_EVAL_CURL: 1086241a4b83SYohann break; // TODO: Not implemented 1087241a4b83SYohann } 1088241a4b83SYohann } 1089ac421f39SYohann 1090241a4b83SYohann // Q function 1091818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1092241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1093818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1094241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1095241a4b83SYohann CeedChk(ierr); 1096241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1097241a4b83SYohann { 1098*0f54b25eSjeremylt if (collograd) { 1099ac421f39SYohann //Accumulator for gradient slices 1100ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1101ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1102ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1103ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1104ac421f39SYohann code << " }\n"; 1105ac421f39SYohann code << " }\n"; 1106ac421f39SYohann } else { 1107241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1108241a4b83SYohann } 1109ac421f39SYohann } 1110241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1111241a4b83SYohann { 1112241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1113241a4b83SYohann } 1114241a4b83SYohann } 1115ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 1116*0f54b25eSjeremylt if (collograd) { 1117920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1118ac421f39SYohann code << "#pragma unroll\n"; 1119ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1120818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1121ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1122818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1123ac421f39SYohann // Get elemsize, emode, ncomp 1124ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1125ac421f39SYohann CeedChk(ierr); 1126ac421f39SYohann // Basis action 1127920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1128ac421f39SYohann switch (emode) { 1129ac421f39SYohann case CEED_EVAL_NONE: 1130ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11319a2291e3SJeremy L Thompson 11329a2291e3SJeremy L Thompson bool isStrided; 1133792ff326SYohann Dudouit ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChk(ierr); 113413585805SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChk(ierr); 1135fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 11369a2291e3SJeremy L Thompson if (!isStrided) { 11375c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 11385c7b696cSjeremylt CeedChk(ierr); 11395c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11405c7b696cSjeremylt CeedInt compstride; 11415c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 11425c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 11436c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 11449a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11455c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1146920dcdc4Sjeremylt } else { 1147920dcdc4Sjeremylt bool backendstrides; 1148fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1149920dcdc4Sjeremylt CeedChk(ierr); 115013585805SJeremy L Thompson CeedInt nelem; 115113585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 115213585805SJeremy L Thompson CeedChk(ierr); 115313585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1154920dcdc4Sjeremylt if (!backendstrides) { 1155920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1156920dcdc4Sjeremylt CeedChk(ierr); 1157920dcdc4Sjeremylt } 1158920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1159d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1160920dcdc4Sjeremylt } 1161ac421f39SYohann break; 1162ac421f39SYohann case CEED_EVAL_INTERP: 1163ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1164ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1165ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1166ac421f39SYohann code << " }\n"; 1167ac421f39SYohann break; 1168ac421f39SYohann case CEED_EVAL_GRAD: 1169ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1170ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1171ac421f39SYohann break; 1172ac421f39SYohann case CEED_EVAL_WEIGHT: 1173ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1174ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1175ac421f39SYohann break; // No action 1176ac421f39SYohann case CEED_EVAL_DIV: 1177ac421f39SYohann break; // TODO: Not implemented 1178ac421f39SYohann case CEED_EVAL_CURL: 1179ac421f39SYohann break; // TODO: Not implemented 1180ac421f39SYohann } 1181ac421f39SYohann } 1182818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1183ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1184818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1185ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1186ac421f39SYohann CeedChk(ierr); 1187ac421f39SYohann // Basis action 1188ac421f39SYohann switch (emode) { 1189ac421f39SYohann case CEED_EVAL_NONE: 1190ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1191ac421f39SYohann break; // No action 1192ac421f39SYohann case CEED_EVAL_INTERP: 1193ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1194ac421f39SYohann break; 1195ac421f39SYohann case CEED_EVAL_GRAD: 1196ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1197ac421f39SYohann break; 1198ac421f39SYohann case CEED_EVAL_WEIGHT: 1199ac421f39SYohann break; // Should not occur 1200ac421f39SYohann case CEED_EVAL_DIV: 1201ac421f39SYohann break; // TODO: Not implemented 1202ac421f39SYohann case CEED_EVAL_CURL: 1203ac421f39SYohann break; // TODO: Not implemented 1204ac421f39SYohann } 1205ac421f39SYohann } 1206ac421f39SYohann } else { 1207920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1208818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1209ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1210818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1211ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1212ac421f39SYohann } 1213818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1214ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1215818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1216ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1217ac421f39SYohann } 1218ac421f39SYohann } 1219818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12204d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12214d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1222818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1223ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12244d537eeaSYohann } 12254d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12264d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1227818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1228ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12294d537eeaSYohann } 1230818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1231ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1232*0f54b25eSjeremylt if (dim != 3 || collograd) { 1233ac421f39SYohann code << "1"; 1234ac421f39SYohann } else { 1235ac421f39SYohann code << "Q1d"; 1236ac421f39SYohann } 1237ac421f39SYohann code << ", in, out);\n"; 1238*0f54b25eSjeremylt if (collograd) { 1239920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1240818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1241ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1242818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1243ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1244ac421f39SYohann CeedChk(ierr); 1245ac421f39SYohann // Basis action 1246920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1247ac421f39SYohann switch (emode) { 1248ac421f39SYohann case CEED_EVAL_NONE: 1249ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1250ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1251ac421f39SYohann code << " }\n"; 1252ac421f39SYohann break; // No action 1253ac421f39SYohann case CEED_EVAL_INTERP: 1254ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1255ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1256ac421f39SYohann code << " }\n"; 1257ac421f39SYohann break; 1258ac421f39SYohann case CEED_EVAL_GRAD: 1259ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1260ac421f39SYohann break; 1261ac421f39SYohann case CEED_EVAL_WEIGHT: 1262ac421f39SYohann break; // Should not occur 1263ac421f39SYohann case CEED_EVAL_DIV: 1264ac421f39SYohann break; // TODO: Not implemented 1265ac421f39SYohann case CEED_EVAL_CURL: 1266ac421f39SYohann break; // TODO: Not implemented 1267ac421f39SYohann } 1268ac421f39SYohann } 1269ac421f39SYohann code << " }\n"; 1270ac421f39SYohann } 1271241a4b83SYohann 1272241a4b83SYohann // Output basis apply if needed 1273ac421f39SYohann // Generate the correct eval mode code for each output 1274818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1275241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1276818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1277241a4b83SYohann // Get elemsize, emode, ncomp 1278241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1279241a4b83SYohann CeedChk(ierr); 1280241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1281241a4b83SYohann CeedChk(ierr); 1282241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1283241a4b83SYohann CeedChk(ierr); 12844d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1285241a4b83SYohann CeedChk(ierr); 1286241a4b83SYohann // Basis action 1287920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1288241a4b83SYohann switch (emode) { 1289241a4b83SYohann case CEED_EVAL_NONE: 1290920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1291241a4b83SYohann break; // No action 1292241a4b83SYohann case CEED_EVAL_INTERP: 1293241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1294241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1295241a4b83SYohann break; 1296241a4b83SYohann case CEED_EVAL_GRAD: 1297241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1298*0f54b25eSjeremylt if (collograd) { 1299ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1300ac421f39SYohann } else { 1301241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1302ac421f39SYohann } 1303241a4b83SYohann break; 1304e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1305241a4b83SYohann case CEED_EVAL_WEIGHT: { 1306241a4b83SYohann Ceed ceed; 1307241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1308241a4b83SYohann return CeedError(ceed, 1, 1309241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1310241a4b83SYohann break; // Should not occur 1311241a4b83SYohann } 1312241a4b83SYohann case CEED_EVAL_DIV: 1313241a4b83SYohann break; // TODO: Not implemented 1314241a4b83SYohann case CEED_EVAL_CURL: 1315241a4b83SYohann break; // TODO: Not implemented 1316e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1317241a4b83SYohann } 1318920dcdc4Sjeremylt // Restriction 13199a2291e3SJeremy L Thompson bool isStrided; 1320fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 13219a2291e3SJeremy L Thompson if (!isStrided) { 13225c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 13235c7b696cSjeremylt CeedChk(ierr); 13245c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13255c7b696cSjeremylt CeedInt compstride; 13265c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 13275c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 13286c845298Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChk(ierr); 13299a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13305c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1331920dcdc4Sjeremylt } else { 1332920dcdc4Sjeremylt bool backendstrides; 1333fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1334920dcdc4Sjeremylt CeedChk(ierr); 133513585805SJeremy L Thompson CeedInt nelem; 133613585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 133713585805SJeremy L Thompson CeedChk(ierr); 133813585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1339920dcdc4Sjeremylt if (!backendstrides) { 1340920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1341920dcdc4Sjeremylt CeedChk(ierr); 1342920dcdc4Sjeremylt } 1343920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1344d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1345920dcdc4Sjeremylt } 1346241a4b83SYohann } 1347241a4b83SYohann 1348241a4b83SYohann code << " }\n"; 1349818e0025SJeremy L Thompson code << "}\n"; 1350818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1351241a4b83SYohann 1352ab213215SJeremy L Thompson // View kernel for debugging 1353b8e71988SJeremy L Thompson CeedDebug(code.str().c_str()); 1354241a4b83SYohann 1355920dcdc4Sjeremylt ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1356920dcdc4Sjeremylt CeedChk(ierr); 1357c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1358241a4b83SYohann CeedChk(ierr); 1359241a4b83SYohann 1360241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1361241a4b83SYohann return 0; 1362241a4b83SYohann } 1363ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1364