1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 163d576824SJeremy L Thompson 17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 187df94212SJeremy L Thompson 19ec3da8bcSJed Brown #include <ceed/ceed.h> 20ec3da8bcSJed Brown #include <ceed/backend.h> 213d576824SJeremy L Thompson #include <cuda_runtime.h> 22241a4b83SYohann #include <iostream> 23241a4b83SYohann #include <sstream> 243d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 25241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 26241a4b83SYohann 27f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 28ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 29ab213215SJeremy L Thompson // Atomic add, for older CUDA 30ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3180a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { 32241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 33241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 34241a4b83SYohann do { 35241a4b83SYohann assumed = old; 36241a4b83SYohann old = 37241a4b83SYohann atomicCAS(address_as_ull, assumed, 38241a4b83SYohann __double_as_longlong(val + 39241a4b83SYohann __longlong_as_double(assumed))); 40241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 41241a4b83SYohann // (since NaN != NaN) 42241a4b83SYohann } while (assumed != old); 43241a4b83SYohann return __longlong_as_double(old); 44241a4b83SYohann } 45f1a13f77SYohann Dudouit ); 46f1a13f77SYohann Dudouit 47f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 48f1a13f77SYohann Dudouit 49ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 50ab213215SJeremy L Thompson // Typedefs 51ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 52f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 53f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 54f1a13f77SYohann Dudouit 55f1a13f77SYohann Dudouit typedef struct { 56f1a13f77SYohann Dudouit CeedInt tidx; 57f1a13f77SYohann Dudouit CeedInt tidy; 58f1a13f77SYohann Dudouit CeedInt tidz; 59f1a13f77SYohann Dudouit CeedInt tid; 60f1a13f77SYohann Dudouit CeedScalar* slice; 61f1a13f77SYohann Dudouit } BackendData; 62241a4b83SYohann 63ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 64ab213215SJeremy L Thompson // Load matrices for basis actions 65ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 66241a4b83SYohann template <int P, int Q> 67053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 68ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 69241a4b83SYohann B[i] = d_B[i]; 70241a4b83SYohann } 71241a4b83SYohann 72ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 73241a4b83SYohann // 1D 74ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 75ab213215SJeremy L Thompson 76ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 77ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 78ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 795c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 80053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 81ab213215SJeremy L Thompson if (data.tidx < P1d) { 824d537eeaSYohann const CeedInt node = data.tidx; 83ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 84ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 855c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 86241a4b83SYohann } 87241a4b83SYohann } 88920dcdc4Sjeremylt 89ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 90ab213215SJeremy L Thompson // L-vector -> E-vector, strided 91ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 92d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 93053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 94ab213215SJeremy L Thompson if (data.tidx < P1d) { 95ccedf6b0Sjeremylt const CeedInt node = data.tidx; 96d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 97ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 98d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 99ccedf6b0Sjeremylt } 100ccedf6b0Sjeremylt } 101241a4b83SYohann 102ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 103ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 104ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1055c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 106053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 107ab213215SJeremy L Thompson if (data.tidx < P1d) { 1084d537eeaSYohann const CeedInt node = data.tidx; 109ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 110ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1115c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 112241a4b83SYohann } 113241a4b83SYohann } 114241a4b83SYohann 115ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 116ab213215SJeremy L Thompson // E-vector -> L-vector, strided 117ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 118d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 119d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 120ab213215SJeremy L Thompson if (data.tidx < P1d) { 121ccedf6b0Sjeremylt const CeedInt node = data.tidx; 122d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 123ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 124d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 125ccedf6b0Sjeremylt } 126ccedf6b0Sjeremylt } 127ccedf6b0Sjeremylt 128ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 129ab213215SJeremy L Thompson // 1D tensor contraction x 130ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 131241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 132ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 133241a4b83SYohann data.slice[data.tidx] = *U; 134241a4b83SYohann __syncthreads(); 135241a4b83SYohann *V = 0.0; 13618d499f1SYohann if (data.tidx < Q1d) 137ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 138ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 139241a4b83SYohann __syncthreads(); 140241a4b83SYohann } 141241a4b83SYohann 142ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 143ab213215SJeremy L Thompson // 1D transpose tensor contraction x 144ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 145241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 146ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 147241a4b83SYohann data.slice[data.tidx] = *U; 148241a4b83SYohann __syncthreads(); 149241a4b83SYohann *V = 0.0; 15018d499f1SYohann if (data.tidx < P1d) 151ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 152ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 153241a4b83SYohann __syncthreads(); 154241a4b83SYohann } 155241a4b83SYohann 156ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 157ab213215SJeremy L Thompson // 1D interpolate to quadrature points 158ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 159241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 160ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 161ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 162241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 163241a4b83SYohann } 164241a4b83SYohann 165ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 166ab213215SJeremy L Thompson // 1D interpolate transpose 167ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 168241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 169ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 170ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 171241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 172241a4b83SYohann } 173241a4b83SYohann 174ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 175ab213215SJeremy L Thompson // 1D derivatives at quadrature points 176ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 177241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 178ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 179ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 180241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 181241a4b83SYohann } 182241a4b83SYohann 183ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 184ab213215SJeremy L Thompson // 1D derivatives transpose 185ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 186241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 187ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 188ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 189241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 190241a4b83SYohann } 191241a4b83SYohann 192ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 193241a4b83SYohann // 2D 194ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 195ab213215SJeremy L Thompson 196ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 197ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 198ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1995c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 200053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 201ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2024d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 203ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 204ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2055c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 206241a4b83SYohann } 207241a4b83SYohann } 208920dcdc4Sjeremylt 209ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 210ab213215SJeremy L Thompson // L-vector -> E-vector, strided 211ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 212d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 213053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 214ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 215ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 216d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 217ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 218d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 219ccedf6b0Sjeremylt } 220ccedf6b0Sjeremylt } 221241a4b83SYohann 222ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 223ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 224ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2255c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 226053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 227ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2284d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 229ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 230ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2315c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 232241a4b83SYohann } 233241a4b83SYohann } 234241a4b83SYohann 235ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 236ab213215SJeremy L Thompson // E-vector -> L-vector, strided 237ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 238d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 239d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 240ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 241ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 242d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 243ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 244d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 245ccedf6b0Sjeremylt } 246ccedf6b0Sjeremylt } 247ccedf6b0Sjeremylt 248ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 249ab213215SJeremy L Thompson // 2D tensor contraction x 250ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 251241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 252ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 25318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 254241a4b83SYohann __syncthreads(); 255241a4b83SYohann *V = 0.0; 25618d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 257ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 25818d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 259241a4b83SYohann __syncthreads(); 260241a4b83SYohann } 261241a4b83SYohann 262ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 263ab213215SJeremy L Thompson // 2D tensor contract y 264ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 265241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 266ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 26718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 268241a4b83SYohann __syncthreads(); 269241a4b83SYohann *V = 0.0; 27018d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 271ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 27218d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 273241a4b83SYohann __syncthreads(); 274241a4b83SYohann } 275241a4b83SYohann 276ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 277ab213215SJeremy L Thompson // 2D transpose tensor contract y 278ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 279241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 280ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 28118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 282241a4b83SYohann __syncthreads(); 283241a4b83SYohann *V = 0.0; 28418d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 285ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 28618d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 287241a4b83SYohann __syncthreads(); 288241a4b83SYohann } 289241a4b83SYohann 290ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 291ab213215SJeremy L Thompson // 2D transpose tensor contract x 292ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 293241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 294ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 29518d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 296241a4b83SYohann __syncthreads(); 297241a4b83SYohann *V = 0.0; 29818d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 299ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 30018d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 301241a4b83SYohann __syncthreads(); 302241a4b83SYohann } 303241a4b83SYohann 304ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 305ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 306ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 307241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 308ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 30918d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 310241a4b83SYohann __syncthreads(); 31118d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 312ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 31318d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 314241a4b83SYohann __syncthreads(); 315241a4b83SYohann } 316241a4b83SYohann 317ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 318ab213215SJeremy L Thompson // 2D interpolate to quadrature points 319ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 320241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 321ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 322241a4b83SYohann CeedScalar r_t[1]; 323ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 324241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 325241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 326241a4b83SYohann } 327241a4b83SYohann } 328241a4b83SYohann 329ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 330ab213215SJeremy L Thompson // 2D interpolate transpose 331ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 332241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 333ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 334241a4b83SYohann CeedScalar r_t[1]; 335ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 336241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 337241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 338241a4b83SYohann } 339241a4b83SYohann } 340241a4b83SYohann 341ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 342ab213215SJeremy L Thompson // 2D derivatives at quadrature points 343ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 344241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 345ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 346241a4b83SYohann CeedScalar r_t[1]; 347ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 348241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 349241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 350241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 351241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 352241a4b83SYohann } 353241a4b83SYohann } 354241a4b83SYohann 355ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 356ab213215SJeremy L Thompson // 2D derivatives transpose 357ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 358241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 359ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 360241a4b83SYohann CeedScalar r_t[1]; 361ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 362241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 363241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 364241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 365241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 366241a4b83SYohann } 367241a4b83SYohann } 368241a4b83SYohann 369ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 370241a4b83SYohann // 3D 371ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 372ab213215SJeremy L Thompson 373ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 374ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 375ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3765c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 377053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 378ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 379241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3804d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 381ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 382ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3835c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 384241a4b83SYohann } 385241a4b83SYohann } 386920dcdc4Sjeremylt 387ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 388ab213215SJeremy L Thompson // L-vector -> E-vector, strided 389ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 390d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 391053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 392ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 393ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 394ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 395d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 396ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 397d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 398ccedf6b0Sjeremylt } 399ccedf6b0Sjeremylt } 400241a4b83SYohann 401ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 402ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 403ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4045c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 405053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 40618d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 407ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 408920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 409ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4105c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 411ac421f39SYohann } 41218d499f1SYohann } 413ac421f39SYohann 414ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 415ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 416ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 417d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 418053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 41918d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 420920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 421d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 422ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 423d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 424920dcdc4Sjeremylt } 42518d499f1SYohann } 426920dcdc4Sjeremylt 427ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 428ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 429ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4305c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 431053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 432ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 433241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4344d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 435ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 436ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4375c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 438241a4b83SYohann } 439241a4b83SYohann } 440241a4b83SYohann 441ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 442ab213215SJeremy L Thompson // E-vector -> L-vector, strided 443ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 444d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4452f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 446ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 447ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 448ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 449d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 450ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 451d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 452ccedf6b0Sjeremylt } 453ccedf6b0Sjeremylt } 454ccedf6b0Sjeremylt 455ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 456ab213215SJeremy L Thompson // 3D tensor contract x 457ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 458241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 459ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 460ac421f39SYohann CeedScalar r_B[P1d]; 461ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 462ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 463ab213215SJeremy L Thompson 464ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 46518d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 466241a4b83SYohann __syncthreads(); 467241a4b83SYohann V[k] = 0.0; 46818d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 469ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 47018d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 471241a4b83SYohann __syncthreads(); 472241a4b83SYohann } 473241a4b83SYohann } 474241a4b83SYohann 475ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 476ab213215SJeremy L Thompson // 3D tensor contract y 477ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 478241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 479ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 480ac421f39SYohann CeedScalar r_B[P1d]; 481ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 482ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 483ab213215SJeremy L Thompson 484ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 48518d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 486241a4b83SYohann __syncthreads(); 487241a4b83SYohann V[k] = 0.0; 48818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 489ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 49018d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 491241a4b83SYohann __syncthreads(); 492241a4b83SYohann } 493241a4b83SYohann } 494241a4b83SYohann 495ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 496ab213215SJeremy L Thompson // 3D tensor contract z 497ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 498241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 499ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 500ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 501241a4b83SYohann V[k] = 0.0; 50218d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 503ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 504ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 505241a4b83SYohann } 506241a4b83SYohann } 507241a4b83SYohann 508ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 509ab213215SJeremy L Thompson // 3D transpose tensor contract z 510ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 511241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 512ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 51318d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 514241a4b83SYohann V[k] = 0.0; 51518d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 516ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 517ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 518241a4b83SYohann } 519241a4b83SYohann } 520241a4b83SYohann 521ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 522ab213215SJeremy L Thompson // 3D transpose tensor contract y 523ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 524241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 525ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 526ac421f39SYohann CeedScalar r_B[Q1d]; 527ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 528ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 529ab213215SJeremy L Thompson 530ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 53118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 532241a4b83SYohann __syncthreads(); 533241a4b83SYohann V[k] = 0.0; 53418d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 535ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 53618d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 537ac421f39SYohann __syncthreads(); 538ac421f39SYohann } 539ac421f39SYohann } 540ac421f39SYohann 541ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 542ab213215SJeremy L Thompson // 3D transpose tensor contract add y 543ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 544ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 545ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 546ac421f39SYohann CeedScalar r_B[Q1d]; 547ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 548ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 549ab213215SJeremy L Thompson 550ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 55118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 552ac421f39SYohann __syncthreads(); 55318d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 554ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 55518d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 556241a4b83SYohann __syncthreads(); 557241a4b83SYohann } 558241a4b83SYohann } 559241a4b83SYohann 560ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 561ab213215SJeremy L Thompson // 3D transpose tensor contract x 562ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 563241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 564ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 565ac421f39SYohann CeedScalar r_B[Q1d]; 566ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 567ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 568ab213215SJeremy L Thompson 569ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 57018d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 571241a4b83SYohann __syncthreads(); 572241a4b83SYohann V[k] = 0.0; 57318d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 574ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 57518d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 576241a4b83SYohann __syncthreads(); 577241a4b83SYohann } 578241a4b83SYohann } 579241a4b83SYohann 580ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 581ab213215SJeremy L Thompson // 3D transpose tensor contract add x 582ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 583241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 584ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 585ac421f39SYohann CeedScalar r_B[Q1d]; 586ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 587ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 588ab213215SJeremy L Thompson 589ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 59018d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 591241a4b83SYohann __syncthreads(); 59218d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 593ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 59418d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 595241a4b83SYohann __syncthreads(); 596241a4b83SYohann } 597241a4b83SYohann } 598241a4b83SYohann 599ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 600ab213215SJeremy L Thompson // 3D interpolate to quadrature points 601ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 602241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 603ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 60418d499f1SYohann CeedScalar r_t1[T1d]; 60518d499f1SYohann CeedScalar r_t2[T1d]; 606ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 607241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 608241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 609241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 610241a4b83SYohann } 611241a4b83SYohann } 612241a4b83SYohann 613ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 614ab213215SJeremy L Thompson // 3D interpolate transpose 615ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 616241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 617ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 61818d499f1SYohann CeedScalar r_t1[T1d]; 61918d499f1SYohann CeedScalar r_t2[T1d]; 620ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 621241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 622241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 623241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 624241a4b83SYohann } 625241a4b83SYohann } 626241a4b83SYohann 627ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 628ab213215SJeremy L Thompson // 3D derivatives at quadrature points 629ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 630241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 631ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 63218d499f1SYohann CeedScalar r_t1[T1d]; 63318d499f1SYohann CeedScalar r_t2[T1d]; 634ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 635241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 636241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 637241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 638241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 639241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 640241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 641241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 642241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 643241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 644241a4b83SYohann } 645241a4b83SYohann } 646241a4b83SYohann 647ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 648ab213215SJeremy L Thompson // 3D derivatives transpose 649ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 650241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 651ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 65218d499f1SYohann CeedScalar r_t1[T1d]; 65318d499f1SYohann CeedScalar r_t2[T1d]; 654ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 655241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 656241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 657241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 658241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 659241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 660241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 661241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 662241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 663241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 664241a4b83SYohann } 665241a4b83SYohann } 666241a4b83SYohann 667ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 668ab213215SJeremy L Thompson // 3D collocated derivatives computation 669ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 670ac421f39SYohann template <int NCOMP, int Q1d> 671ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 67218d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 673ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 67418d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 675ac421f39SYohann __syncthreads(); 676ac421f39SYohann // X derivative 677ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 678ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 67918d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 680ac421f39SYohann // Y derivative 681ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 682ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 68318d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 684ac421f39SYohann // Z derivative 685ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 686ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 687ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 688ac421f39SYohann __syncthreads(); 689ac421f39SYohann } 690ac421f39SYohann } 69118d499f1SYohann } 692ac421f39SYohann 693ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 694ab213215SJeremy L Thompson // 3D collocated derivatives transpose 695ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 696ac421f39SYohann template <int NCOMP, int Q1d> 697ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 69818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 699ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 700ac421f39SYohann // X derivative 70118d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 702ac421f39SYohann __syncthreads(); 703ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70418d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 705ac421f39SYohann __syncthreads(); 706ac421f39SYohann // Y derivative 70718d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 708ac421f39SYohann __syncthreads(); 709ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 71018d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 711ac421f39SYohann __syncthreads(); 712ac421f39SYohann // Z derivative 713ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 714ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 715ac421f39SYohann } 716ac421f39SYohann } 71718d499f1SYohann } 718ac421f39SYohann 719ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 720ab213215SJeremy L Thompson // 1D quadrature weights 721ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 722241a4b83SYohann template <int Q1d> 723053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 72418d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 725241a4b83SYohann } 726241a4b83SYohann 727ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 728ab213215SJeremy L Thompson // 2D quadrature weights 729ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 730241a4b83SYohann template <int Q1d> 731053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 73218d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 73318d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 734241a4b83SYohann } 735241a4b83SYohann 736ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 737ab213215SJeremy L Thompson // 3D quadrature weights 738ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 739241a4b83SYohann template <int Q1d> 740053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 74118d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 74218d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 743ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 74418d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 745241a4b83SYohann } 746241a4b83SYohann 747241a4b83SYohann ); 748ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 749ab213215SJeremy L Thompson // Build singe operator kernel 750ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 751241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 752241a4b83SYohann 753241a4b83SYohann using std::ostringstream; 754241a4b83SYohann using std::string; 755241a4b83SYohann int ierr; 756241a4b83SYohann bool setupdone; 757e15f9bd0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 758e15f9bd0SJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 759241a4b83SYohann Ceed ceed; 760e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 761241a4b83SYohann CeedOperator_Cuda_gen *data; 762e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 763241a4b83SYohann CeedQFunction qf; 764241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 765e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 766e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 767*88db6d3bSJeremy L Thompson CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 7685c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 769e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 770e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 771241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 7727e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 773e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 774241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 7757e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 776e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 777241a4b83SYohann CeedEvalMode emode; 778241a4b83SYohann CeedBasis basis; 779241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 780241a4b83SYohann CeedElemRestriction Erestrict; 781461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 782241a4b83SYohann 7830b454692Sjeremylt // Check for restriction only identity operator 7840b454692Sjeremylt bool is_identity_qf; 7850b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 7860b454692Sjeremylt if (is_identity_qf) { 7870b454692Sjeremylt CeedEvalMode emodein, emodeout; 7880b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 7890b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 7900b454692Sjeremylt if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 7910b454692Sjeremylt // LCOV_EXCL_START 7920b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 7930b454692Sjeremylt "Backend does not implement restriction only identity operators"); 7940b454692Sjeremylt // LCOV_EXCL_STOP 7950b454692Sjeremylt } 7960b454692Sjeremylt 797241a4b83SYohann ostringstream code; 798241a4b83SYohann string devFunctions(deviceFunctions); 799241a4b83SYohann 800f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 801f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 802f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 803*88db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 804*88db6d3bSJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); CeedChkBackend(ierr); 80580a9ef05SNatalie Beams if ((prop.major<6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 806f1a13f77SYohann Dudouit code << atomicAdd; 807f1a13f77SYohann Dudouit } 808f1a13f77SYohann Dudouit 809241a4b83SYohann code << devFunctions; 810241a4b83SYohann 811241a4b83SYohann string qFunction(qf_data->qFunctionSource); 812c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 813c3c443faSYohann Dudouit string oper; 81414cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 8154d537eeaSYohann 8164d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 81777841947SLeila Ghaffari code << "#define CEED_QFUNCTION_HELPER inline __device__\n"; 818e15f9bd0SJeremy L Thompson code << "#define CeedPragmaSIMD\n"; 819e15f9bd0SJeremy L Thompson code << "#define CEED_ERROR_SUCCESS 0\n\n"; 8201da99368SJeremy L Thompson 8211da99368SJeremy L Thompson // Find dim and Q1d 82264d3f0c0Sjeremylt bool useCollograd = true; 82318d499f1SYohann data->maxP1d = 0; 8241da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 825e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 8261da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 827e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8280f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 829e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8301da99368SJeremy L Thompson 8311da99368SJeremy L Thompson // Check for collocated gradient 83218d499f1SYohann useCollograd = useCollograd && basis_data->d_collograd1d; 8331da99368SJeremy L Thompson 8341da99368SJeremy L Thompson // Collect dim and Q1d 835e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8361da99368SJeremy L Thompson bool isTensor; 837e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8381da99368SJeremy L Thompson if (isTensor) { 839e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 840e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 84118d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8421da99368SJeremy L Thompson } else { 843e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 844e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 845e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8461da99368SJeremy L Thompson } 8471da99368SJeremy L Thompson } 8481da99368SJeremy L Thompson } 8491da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8501da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8511da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 852e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 8530f54b25eSjeremylt 8541da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 855e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8560f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 857e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8580f54b25eSjeremylt 8591da99368SJeremy L Thompson // Collect dim and Q1d 860e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8611da99368SJeremy L Thompson bool isTensor; 862e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8631da99368SJeremy L Thompson if (isTensor) { 864e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 8651da99368SJeremy L Thompson } else { 866e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 867e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 868e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8691da99368SJeremy L Thompson } 8700f54b25eSjeremylt 8710f54b25eSjeremylt // Check for collocated gradient 87264d3f0c0Sjeremylt useCollograd = useCollograd && basis_data->d_collograd1d; 8731da99368SJeremy L Thompson } 8741da99368SJeremy L Thompson } 8751da99368SJeremy L Thompson data->dim = dim; 8761da99368SJeremy L Thompson data->Q1d = Q1d; 8771da99368SJeremy L Thompson 8781da99368SJeremy L Thompson // Define CEED_Q_VLA 87964d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8801da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8811da99368SJeremy L Thompson } else { 8821da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8831da99368SJeremy L Thompson } 8841da99368SJeremy L Thompson 885241a4b83SYohann code << qFunction; 886241a4b83SYohann 887241a4b83SYohann // Setup 888818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 889c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 890241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 891241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 892e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8931da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 894241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 895241a4b83SYohann } 896241a4b83SYohann } 897241a4b83SYohann 898241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 899241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 900241a4b83SYohann } 9011da99368SJeremy L Thompson 902241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 903241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 9041da99368SJeremy L Thompson 905241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 906241a4b83SYohann code << " BackendData data;\n"; 907241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 908241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 909241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 910241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 91118d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 912920dcdc4Sjeremylt 913818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 914ac421f39SYohann //Initialize constants, and matrices B and G 915241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 916818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 917241a4b83SYohann // Get elemsize, emode, ncomp 918241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 919e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 920241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 921e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 922241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 923e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9244d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 925e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 926920dcdc4Sjeremylt 927920dcdc4Sjeremylt // Set field constants 928920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 929e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 930920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 931e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 932920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 933920dcdc4Sjeremylt } else { 934920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 935920dcdc4Sjeremylt } 936920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 937920dcdc4Sjeremylt } 938920dcdc4Sjeremylt 939920dcdc4Sjeremylt // Load basis data 940920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 941241a4b83SYohann switch (emode) { 942241a4b83SYohann case CEED_EVAL_NONE: 943241a4b83SYohann break; 944241a4b83SYohann case CEED_EVAL_INTERP: 945e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 946241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 94780a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 948241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 949241a4b83SYohann break; 950241a4b83SYohann case CEED_EVAL_GRAD: 951e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 952241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 95380a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 954241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 95564d3f0c0Sjeremylt if (useCollograd) { 956ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 95780a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 958ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 959ac421f39SYohann } else { 960ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 96180a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 962241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 963ac421f39SYohann } 964241a4b83SYohann break; 965241a4b83SYohann case CEED_EVAL_WEIGHT: 966241a4b83SYohann break; // No action 967241a4b83SYohann case CEED_EVAL_DIV: 968241a4b83SYohann break; // TODO: Not implemented 969241a4b83SYohann case CEED_EVAL_CURL: 970241a4b83SYohann break; // TODO: Not implemented 971241a4b83SYohann } 972241a4b83SYohann } 973920dcdc4Sjeremylt 974818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 975241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 976818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 977241a4b83SYohann // Get elemsize, emode, ncomp 978241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 979e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 980241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 981e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 982241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 983e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9844d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 985e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 986920dcdc4Sjeremylt 987920dcdc4Sjeremylt // Set field constants 988e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 989920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 990e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 991241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 992920dcdc4Sjeremylt } else { 993920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 994920dcdc4Sjeremylt } 995920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 996920dcdc4Sjeremylt 997920dcdc4Sjeremylt // Load basis data 998920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 999920dcdc4Sjeremylt switch (emode) { 1000920dcdc4Sjeremylt case CEED_EVAL_NONE: 1001920dcdc4Sjeremylt break; // No action 1002920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1003e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1004241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 100580a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1006241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1007241a4b83SYohann break; 1008241a4b83SYohann case CEED_EVAL_GRAD: 1009e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1010241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 101180a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1012241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 101364d3f0c0Sjeremylt if (useCollograd) { 1014ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 101580a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1016ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1017ac421f39SYohann } else { 1018ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 101980a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1020241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1021ac421f39SYohann } 1022241a4b83SYohann break; 1023e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1024241a4b83SYohann case CEED_EVAL_WEIGHT: { 1025241a4b83SYohann Ceed ceed; 1026e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1027e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1028241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1029241a4b83SYohann break; // Should not occur 1030241a4b83SYohann } 1031241a4b83SYohann case CEED_EVAL_DIV: 1032241a4b83SYohann break; // TODO: Not implemented 1033241a4b83SYohann case CEED_EVAL_CURL: 1034241a4b83SYohann break; // TODO: Not implemented 1035e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1036241a4b83SYohann } 1037241a4b83SYohann } 1038818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1039ac421f39SYohann code << " __syncthreads();\n"; 1040241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1041241a4b83SYohann // Input basis apply if needed 1042ac421f39SYohann // Generate the correct eval mode code for each input 1043818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1044241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1045818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1046241a4b83SYohann // Get elemsize, emode, ncomp 1047241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1048e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1049241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1050e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1051241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1052e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10534d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1054e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1055920dcdc4Sjeremylt 1056920dcdc4Sjeremylt // Restriction 1057920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 105864d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1059241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10609a2291e3SJeremy L Thompson 10619a2291e3SJeremy L Thompson bool isStrided; 1062e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 10639a2291e3SJeremy L Thompson if (!isStrided) { 10645c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1065e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10665c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10675c7b696cSjeremylt CeedInt compstride; 1068e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 10695c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1070e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 10719a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10725c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1073ccedf6b0Sjeremylt } else { 1074920dcdc4Sjeremylt bool backendstrides; 1075fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1076e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 107713585805SJeremy L Thompson CeedInt nelem; 107813585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1079e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108013585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1081920dcdc4Sjeremylt if (!backendstrides) { 1082920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1083e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1084ccedf6b0Sjeremylt } 1085920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1086d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1087920dcdc4Sjeremylt } 1088920dcdc4Sjeremylt } 1089920dcdc4Sjeremylt 1090920dcdc4Sjeremylt // Basis action 1091920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1092920dcdc4Sjeremylt switch (emode) { 1093920dcdc4Sjeremylt case CEED_EVAL_NONE: 109464d3f0c0Sjeremylt if (!useCollograd) { 1095920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1096920dcdc4Sjeremylt } 1097920dcdc4Sjeremylt break; 1098920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1099241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1100241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1101241a4b83SYohann break; 1102241a4b83SYohann case CEED_EVAL_GRAD: 110364d3f0c0Sjeremylt if (useCollograd) { 1104ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1105ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1106ac421f39SYohann } else { 1107241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1108241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1109ac421f39SYohann } 1110241a4b83SYohann break; 1111241a4b83SYohann case CEED_EVAL_WEIGHT: 1112241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1113e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1114e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1115241a4b83SYohann data->W = basis_data->d_qweight1d; 1116241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1117241a4b83SYohann break; // No action 1118241a4b83SYohann case CEED_EVAL_DIV: 1119241a4b83SYohann break; // TODO: Not implemented 1120241a4b83SYohann case CEED_EVAL_CURL: 1121241a4b83SYohann break; // TODO: Not implemented 1122241a4b83SYohann } 1123241a4b83SYohann } 1124ac421f39SYohann 1125241a4b83SYohann // Q function 1126818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1127241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1128818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1129241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1130e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1131241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1132241a4b83SYohann { 113364d3f0c0Sjeremylt if (useCollograd) { 1134ac421f39SYohann //Accumulator for gradient slices 1135ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1136ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1137ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1138ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1139ac421f39SYohann code << " }\n"; 1140ac421f39SYohann code << " }\n"; 1141ac421f39SYohann } else { 1142241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1143241a4b83SYohann } 1144ac421f39SYohann } 1145241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1146241a4b83SYohann { 1147241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1148241a4b83SYohann } 1149241a4b83SYohann } 1150ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 115164d3f0c0Sjeremylt if (useCollograd) { 1152920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1153ac421f39SYohann code << "#pragma unroll\n"; 1154ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1155818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1156ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1157818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1158ac421f39SYohann // Get elemsize, emode, ncomp 1159ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1160e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1161ac421f39SYohann // Basis action 1162920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1163ac421f39SYohann switch (emode) { 1164ac421f39SYohann case CEED_EVAL_NONE: 1165ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11669a2291e3SJeremy L Thompson 11679a2291e3SJeremy L Thompson bool isStrided; 1168e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1169e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1170e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 11719a2291e3SJeremy L Thompson if (!isStrided) { 11725c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1173e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11745c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11755c7b696cSjeremylt CeedInt compstride; 1176e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 11775c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1178e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 11799a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11805c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1181920dcdc4Sjeremylt } else { 1182920dcdc4Sjeremylt bool backendstrides; 1183fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1184e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 118513585805SJeremy L Thompson CeedInt nelem; 118613585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1187e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 118813585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1189920dcdc4Sjeremylt if (!backendstrides) { 1190920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1191e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1192920dcdc4Sjeremylt } 1193920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1194d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1195920dcdc4Sjeremylt } 1196ac421f39SYohann break; 1197ac421f39SYohann case CEED_EVAL_INTERP: 1198ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1199ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1200ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1201ac421f39SYohann code << " }\n"; 1202ac421f39SYohann break; 1203ac421f39SYohann case CEED_EVAL_GRAD: 1204ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1205ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1206ac421f39SYohann break; 1207ac421f39SYohann case CEED_EVAL_WEIGHT: 1208ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1209ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1210ac421f39SYohann break; // No action 1211ac421f39SYohann case CEED_EVAL_DIV: 1212ac421f39SYohann break; // TODO: Not implemented 1213ac421f39SYohann case CEED_EVAL_CURL: 1214ac421f39SYohann break; // TODO: Not implemented 1215ac421f39SYohann } 1216ac421f39SYohann } 1217818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1218ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1219818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1220ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1221e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1222ac421f39SYohann // Basis action 1223ac421f39SYohann switch (emode) { 1224ac421f39SYohann case CEED_EVAL_NONE: 1225ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1226ac421f39SYohann break; // No action 1227ac421f39SYohann case CEED_EVAL_INTERP: 1228ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1229ac421f39SYohann break; 1230ac421f39SYohann case CEED_EVAL_GRAD: 1231ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1232ac421f39SYohann break; 1233ac421f39SYohann case CEED_EVAL_WEIGHT: 1234ac421f39SYohann break; // Should not occur 1235ac421f39SYohann case CEED_EVAL_DIV: 1236ac421f39SYohann break; // TODO: Not implemented 1237ac421f39SYohann case CEED_EVAL_CURL: 1238ac421f39SYohann break; // TODO: Not implemented 1239ac421f39SYohann } 1240ac421f39SYohann } 1241ac421f39SYohann } else { 1242920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1243818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1244ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1245818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1246ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1247ac421f39SYohann } 1248818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1249ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1250818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1251ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1252ac421f39SYohann } 1253ac421f39SYohann } 1254818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12554d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12564d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1257818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1258ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12594d537eeaSYohann } 12604d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12614d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1262818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1263ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12644d537eeaSYohann } 1265818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1266ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 126764d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1268ac421f39SYohann code << "1"; 1269ac421f39SYohann } else { 1270ac421f39SYohann code << "Q1d"; 1271ac421f39SYohann } 1272ac421f39SYohann code << ", in, out);\n"; 127364d3f0c0Sjeremylt if (useCollograd) { 1274920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1275818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1276ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1277818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1278ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1279e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1280ac421f39SYohann // Basis action 1281920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1282ac421f39SYohann switch (emode) { 1283ac421f39SYohann case CEED_EVAL_NONE: 1284ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1285ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1286ac421f39SYohann code << " }\n"; 1287ac421f39SYohann break; // No action 1288ac421f39SYohann case CEED_EVAL_INTERP: 1289ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1290ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1291ac421f39SYohann code << " }\n"; 1292ac421f39SYohann break; 1293ac421f39SYohann case CEED_EVAL_GRAD: 1294ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1295ac421f39SYohann break; 1296ac421f39SYohann case CEED_EVAL_WEIGHT: 1297ac421f39SYohann break; // Should not occur 1298ac421f39SYohann case CEED_EVAL_DIV: 1299ac421f39SYohann break; // TODO: Not implemented 1300ac421f39SYohann case CEED_EVAL_CURL: 1301ac421f39SYohann break; // TODO: Not implemented 1302ac421f39SYohann } 1303ac421f39SYohann } 1304ac421f39SYohann code << " }\n"; 1305ac421f39SYohann } 1306241a4b83SYohann 1307241a4b83SYohann // Output basis apply if needed 1308ac421f39SYohann // Generate the correct eval mode code for each output 1309818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1310241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1311818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1312241a4b83SYohann // Get elemsize, emode, ncomp 1313241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1314e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1315241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1316e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1317241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1318e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13194d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1320e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1321241a4b83SYohann // Basis action 1322920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1323241a4b83SYohann switch (emode) { 1324241a4b83SYohann case CEED_EVAL_NONE: 1325920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1326241a4b83SYohann break; // No action 1327241a4b83SYohann case CEED_EVAL_INTERP: 1328241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1329241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1330241a4b83SYohann break; 1331241a4b83SYohann case CEED_EVAL_GRAD: 1332241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 133364d3f0c0Sjeremylt if (useCollograd) { 1334ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1335ac421f39SYohann } else { 1336241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1337ac421f39SYohann } 1338241a4b83SYohann break; 1339e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1340241a4b83SYohann case CEED_EVAL_WEIGHT: { 1341241a4b83SYohann Ceed ceed; 1342e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1343e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1344241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1345241a4b83SYohann break; // Should not occur 1346241a4b83SYohann } 1347241a4b83SYohann case CEED_EVAL_DIV: 1348241a4b83SYohann break; // TODO: Not implemented 1349241a4b83SYohann case CEED_EVAL_CURL: 1350241a4b83SYohann break; // TODO: Not implemented 1351e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1352241a4b83SYohann } 1353920dcdc4Sjeremylt // Restriction 13549a2291e3SJeremy L Thompson bool isStrided; 1355e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 13569a2291e3SJeremy L Thompson if (!isStrided) { 13575c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1358e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13595c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13605c7b696cSjeremylt CeedInt compstride; 1361e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 13625c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1363e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 13649a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13655c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1366920dcdc4Sjeremylt } else { 1367920dcdc4Sjeremylt bool backendstrides; 1368fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1369e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137013585805SJeremy L Thompson CeedInt nelem; 137113585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1372e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137313585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1374920dcdc4Sjeremylt if (!backendstrides) { 1375920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1376e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1377920dcdc4Sjeremylt } 1378920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1379d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1380920dcdc4Sjeremylt } 1381241a4b83SYohann } 1382241a4b83SYohann 1383241a4b83SYohann code << " }\n"; 1384818e0025SJeremy L Thompson code << "}\n"; 1385818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1386241a4b83SYohann 1387ab213215SJeremy L Thompson // View kernel for debugging 1388b8e71988SJeremy L Thompson CeedDebug(code.str().c_str()); 1389241a4b83SYohann 139018d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 139118d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1392e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1393c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1394e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1395241a4b83SYohann 1396e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1397e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1398241a4b83SYohann } 1399ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1400