1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 163d576824SJeremy L Thompson 17b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 187df94212SJeremy L Thompson 19ec3da8bcSJed Brown #include <ceed/ceed.h> 20ec3da8bcSJed Brown #include <ceed/backend.h> 213d576824SJeremy L Thompson #include <cuda_runtime.h> 22241a4b83SYohann #include <iostream> 23241a4b83SYohann #include <sstream> 243d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 256d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 260d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 27241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 28241a4b83SYohann 29f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 30ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 31ab213215SJeremy L Thompson // Atomic add, for older CUDA 32ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3380a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { 34241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 35241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 36241a4b83SYohann do { 37241a4b83SYohann assumed = old; 38241a4b83SYohann old = 39241a4b83SYohann atomicCAS(address_as_ull, assumed, 40241a4b83SYohann __double_as_longlong(val + 41241a4b83SYohann __longlong_as_double(assumed))); 42241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 43241a4b83SYohann // (since NaN != NaN) 44241a4b83SYohann } while (assumed != old); 45241a4b83SYohann return __longlong_as_double(old); 46241a4b83SYohann } 47f1a13f77SYohann Dudouit ); 48f1a13f77SYohann Dudouit 49f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 50f1a13f77SYohann Dudouit 51ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 52ab213215SJeremy L Thompson // Typedefs 53ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 54f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 55f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 56f1a13f77SYohann Dudouit 57f1a13f77SYohann Dudouit typedef struct { 58f1a13f77SYohann Dudouit CeedInt tidx; 59f1a13f77SYohann Dudouit CeedInt tidy; 60f1a13f77SYohann Dudouit CeedInt tidz; 61f1a13f77SYohann Dudouit CeedInt tid; 62f1a13f77SYohann Dudouit CeedScalar* slice; 63f1a13f77SYohann Dudouit } BackendData; 64241a4b83SYohann 65ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 66ab213215SJeremy L Thompson // Load matrices for basis actions 67ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 68241a4b83SYohann template <int P, int Q> 69053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 70ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 71241a4b83SYohann B[i] = d_B[i]; 72241a4b83SYohann } 73241a4b83SYohann 74ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 75241a4b83SYohann // 1D 76ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 77ab213215SJeremy L Thompson 78ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 79ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 80ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 815c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 82053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 83ab213215SJeremy L Thompson if (data.tidx < P1d) { 844d537eeaSYohann const CeedInt node = data.tidx; 85ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 86ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 875c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 88241a4b83SYohann } 89241a4b83SYohann } 90920dcdc4Sjeremylt 91ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 92ab213215SJeremy L Thompson // L-vector -> E-vector, strided 93ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 94d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 95053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 96ab213215SJeremy L Thompson if (data.tidx < P1d) { 97ccedf6b0Sjeremylt const CeedInt node = data.tidx; 98d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 99ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 100d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 101ccedf6b0Sjeremylt } 102ccedf6b0Sjeremylt } 103241a4b83SYohann 104ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 105ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 106ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1075c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 108053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 109ab213215SJeremy L Thompson if (data.tidx < P1d) { 1104d537eeaSYohann const CeedInt node = data.tidx; 111ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 112ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1135c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 114241a4b83SYohann } 115241a4b83SYohann } 116241a4b83SYohann 117ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 118ab213215SJeremy L Thompson // E-vector -> L-vector, strided 119ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 120d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 121d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 122ab213215SJeremy L Thompson if (data.tidx < P1d) { 123ccedf6b0Sjeremylt const CeedInt node = data.tidx; 124d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 125ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 126d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 127ccedf6b0Sjeremylt } 128ccedf6b0Sjeremylt } 129ccedf6b0Sjeremylt 130ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 131ab213215SJeremy L Thompson // 1D tensor contraction x 132ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 133241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 134ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 135241a4b83SYohann data.slice[data.tidx] = *U; 136241a4b83SYohann __syncthreads(); 137241a4b83SYohann *V = 0.0; 13818d499f1SYohann if (data.tidx < Q1d) 139ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 140ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 141241a4b83SYohann __syncthreads(); 142241a4b83SYohann } 143241a4b83SYohann 144ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 145ab213215SJeremy L Thompson // 1D transpose tensor contraction x 146ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 147241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 148ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 149241a4b83SYohann data.slice[data.tidx] = *U; 150241a4b83SYohann __syncthreads(); 151241a4b83SYohann *V = 0.0; 15218d499f1SYohann if (data.tidx < P1d) 153ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 154ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 155241a4b83SYohann __syncthreads(); 156241a4b83SYohann } 157241a4b83SYohann 158ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 159ab213215SJeremy L Thompson // 1D interpolate to quadrature points 160ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 161241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 162ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 163ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 164241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 165241a4b83SYohann } 166241a4b83SYohann 167ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 168ab213215SJeremy L Thompson // 1D interpolate transpose 169ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 170241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 171ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 172ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 173241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 174241a4b83SYohann } 175241a4b83SYohann 176ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 177ab213215SJeremy L Thompson // 1D derivatives at quadrature points 178ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 179241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 180ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 181ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 182241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 183241a4b83SYohann } 184241a4b83SYohann 185ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 186ab213215SJeremy L Thompson // 1D derivatives transpose 187ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 188241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 189ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 190ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 191241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 192241a4b83SYohann } 193241a4b83SYohann 194ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 195241a4b83SYohann // 2D 196ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 197ab213215SJeremy L Thompson 198ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 199ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 200ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 202053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 203ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2044d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 205ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 206ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2075c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 208241a4b83SYohann } 209241a4b83SYohann } 210920dcdc4Sjeremylt 211ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 212ab213215SJeremy L Thompson // L-vector -> E-vector, strided 213ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 214d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 215053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 216ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 217ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 218d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 219ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 220d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 221ccedf6b0Sjeremylt } 222ccedf6b0Sjeremylt } 223241a4b83SYohann 224ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 225ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 226ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2275c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 228053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 229ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2304d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 231ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 232ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2335c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 234241a4b83SYohann } 235241a4b83SYohann } 236241a4b83SYohann 237ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 238ab213215SJeremy L Thompson // E-vector -> L-vector, strided 239ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 240d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 241d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 242ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 243ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 244d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 245ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 246d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 247ccedf6b0Sjeremylt } 248ccedf6b0Sjeremylt } 249ccedf6b0Sjeremylt 250ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 251ab213215SJeremy L Thompson // 2D tensor contraction x 252ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 253241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 254ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 25518d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 256241a4b83SYohann __syncthreads(); 257241a4b83SYohann *V = 0.0; 25818d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 259ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 26018d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 261241a4b83SYohann __syncthreads(); 262241a4b83SYohann } 263241a4b83SYohann 264ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 265ab213215SJeremy L Thompson // 2D tensor contract y 266ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 267241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 268ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 26918d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 270241a4b83SYohann __syncthreads(); 271241a4b83SYohann *V = 0.0; 27218d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 273ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 27418d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 275241a4b83SYohann __syncthreads(); 276241a4b83SYohann } 277241a4b83SYohann 278ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 279ab213215SJeremy L Thompson // 2D transpose tensor contract y 280ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 281241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 282ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 28318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 284241a4b83SYohann __syncthreads(); 285241a4b83SYohann *V = 0.0; 28618d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 287ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 28818d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 289241a4b83SYohann __syncthreads(); 290241a4b83SYohann } 291241a4b83SYohann 292ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 293ab213215SJeremy L Thompson // 2D transpose tensor contract x 294ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 295241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 296ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 29718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 298241a4b83SYohann __syncthreads(); 299241a4b83SYohann *V = 0.0; 30018d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 301ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 30218d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 303241a4b83SYohann __syncthreads(); 304241a4b83SYohann } 305241a4b83SYohann 306ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 307ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 308ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 309241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 310ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 31118d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 312241a4b83SYohann __syncthreads(); 31318d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 314ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 31518d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 316241a4b83SYohann __syncthreads(); 317241a4b83SYohann } 318241a4b83SYohann 319ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 320ab213215SJeremy L Thompson // 2D interpolate to quadrature points 321ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 322241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 323ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 324241a4b83SYohann CeedScalar r_t[1]; 325ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 326241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 327241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 328241a4b83SYohann } 329241a4b83SYohann } 330241a4b83SYohann 331ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 332ab213215SJeremy L Thompson // 2D interpolate transpose 333ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 334241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 335ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 336241a4b83SYohann CeedScalar r_t[1]; 337ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 338241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 339241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 340241a4b83SYohann } 341241a4b83SYohann } 342241a4b83SYohann 343ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 344ab213215SJeremy L Thompson // 2D derivatives at quadrature points 345ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 346241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 347ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 348241a4b83SYohann CeedScalar r_t[1]; 349ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 350241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 351241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 352241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 353241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 354241a4b83SYohann } 355241a4b83SYohann } 356241a4b83SYohann 357ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 358ab213215SJeremy L Thompson // 2D derivatives transpose 359ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 360241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 361ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 362241a4b83SYohann CeedScalar r_t[1]; 363ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 364241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 365241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 366241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 367241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 368241a4b83SYohann } 369241a4b83SYohann } 370241a4b83SYohann 371ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 372241a4b83SYohann // 3D 373ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 374ab213215SJeremy L Thompson 375ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 376ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 377ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3785c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 379053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 380ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 381241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3824d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 383ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 384ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3855c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 386241a4b83SYohann } 387241a4b83SYohann } 388920dcdc4Sjeremylt 389ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 390ab213215SJeremy L Thompson // L-vector -> E-vector, strided 391ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 392d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 393053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 394ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 395ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 396ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 397d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 398ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 399d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 400ccedf6b0Sjeremylt } 401ccedf6b0Sjeremylt } 402241a4b83SYohann 403ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 404ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 405ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4065c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 407053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 40818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 409ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 410920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 411ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4125c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 413ac421f39SYohann } 41418d499f1SYohann } 415ac421f39SYohann 416ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 417ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 418ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 419d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 420053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 42118d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 422920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 423d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 424ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 425d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 426920dcdc4Sjeremylt } 42718d499f1SYohann } 428920dcdc4Sjeremylt 429ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 430ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 431ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4325c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 433053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 434ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 435241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4364d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 437ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 438ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4395c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 440241a4b83SYohann } 441241a4b83SYohann } 442241a4b83SYohann 443ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 444ab213215SJeremy L Thompson // E-vector -> L-vector, strided 445ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 446d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4472f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 448ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 449ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 450ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 451d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 452ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 453d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 454ccedf6b0Sjeremylt } 455ccedf6b0Sjeremylt } 456ccedf6b0Sjeremylt 457ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 458ab213215SJeremy L Thompson // 3D tensor contract x 459ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 460241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 461ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 462ac421f39SYohann CeedScalar r_B[P1d]; 463ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 464ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 465ab213215SJeremy L Thompson 466ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 46718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 468241a4b83SYohann __syncthreads(); 469241a4b83SYohann V[k] = 0.0; 47018d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 471ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 47218d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 473241a4b83SYohann __syncthreads(); 474241a4b83SYohann } 475241a4b83SYohann } 476241a4b83SYohann 477ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 478ab213215SJeremy L Thompson // 3D tensor contract y 479ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 480241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 481ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 482ac421f39SYohann CeedScalar r_B[P1d]; 483ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 484ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 485ab213215SJeremy L Thompson 486ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 48718d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 488241a4b83SYohann __syncthreads(); 489241a4b83SYohann V[k] = 0.0; 49018d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 491ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 49218d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 493241a4b83SYohann __syncthreads(); 494241a4b83SYohann } 495241a4b83SYohann } 496241a4b83SYohann 497ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 498ab213215SJeremy L Thompson // 3D tensor contract z 499ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 500241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 501ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 502ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 503241a4b83SYohann V[k] = 0.0; 50418d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 505ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 506ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 507241a4b83SYohann } 508241a4b83SYohann } 509241a4b83SYohann 510ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 511ab213215SJeremy L Thompson // 3D transpose tensor contract z 512ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 513241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 514ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 51518d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 516241a4b83SYohann V[k] = 0.0; 51718d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 518ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 519ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 520241a4b83SYohann } 521241a4b83SYohann } 522241a4b83SYohann 523ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 524ab213215SJeremy L Thompson // 3D transpose tensor contract y 525ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 526241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 527ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 528ac421f39SYohann CeedScalar r_B[Q1d]; 529ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 530ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 531ab213215SJeremy L Thompson 532ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 53318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 534241a4b83SYohann __syncthreads(); 535241a4b83SYohann V[k] = 0.0; 53618d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 537ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 53818d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 539ac421f39SYohann __syncthreads(); 540ac421f39SYohann } 541ac421f39SYohann } 542ac421f39SYohann 543ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 544ab213215SJeremy L Thompson // 3D transpose tensor contract add y 545ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 546ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 547ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 548ac421f39SYohann CeedScalar r_B[Q1d]; 549ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 550ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 551ab213215SJeremy L Thompson 552ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 55318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 554ac421f39SYohann __syncthreads(); 55518d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 556ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 55718d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 558241a4b83SYohann __syncthreads(); 559241a4b83SYohann } 560241a4b83SYohann } 561241a4b83SYohann 562ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 563ab213215SJeremy L Thompson // 3D transpose tensor contract x 564ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 565241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 566ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 567ac421f39SYohann CeedScalar r_B[Q1d]; 568ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 569ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 570ab213215SJeremy L Thompson 571ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 57218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 573241a4b83SYohann __syncthreads(); 574241a4b83SYohann V[k] = 0.0; 57518d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 576ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 57718d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 578241a4b83SYohann __syncthreads(); 579241a4b83SYohann } 580241a4b83SYohann } 581241a4b83SYohann 582ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 583ab213215SJeremy L Thompson // 3D transpose tensor contract add x 584ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 585241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 586ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 587ac421f39SYohann CeedScalar r_B[Q1d]; 588ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 589ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 590ab213215SJeremy L Thompson 591ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 59218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 593241a4b83SYohann __syncthreads(); 59418d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 595ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 59618d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 597241a4b83SYohann __syncthreads(); 598241a4b83SYohann } 599241a4b83SYohann } 600241a4b83SYohann 601ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 602ab213215SJeremy L Thompson // 3D interpolate to quadrature points 603ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 604241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 605ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 60618d499f1SYohann CeedScalar r_t1[T1d]; 60718d499f1SYohann CeedScalar r_t2[T1d]; 608ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 609241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 610241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 611241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 612241a4b83SYohann } 613241a4b83SYohann } 614241a4b83SYohann 615ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 616ab213215SJeremy L Thompson // 3D interpolate transpose 617ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 618241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 619ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 62018d499f1SYohann CeedScalar r_t1[T1d]; 62118d499f1SYohann CeedScalar r_t2[T1d]; 622ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 623241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 624241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 625241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 626241a4b83SYohann } 627241a4b83SYohann } 628241a4b83SYohann 629ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 630ab213215SJeremy L Thompson // 3D derivatives at quadrature points 631ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 632241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 633ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 63418d499f1SYohann CeedScalar r_t1[T1d]; 63518d499f1SYohann CeedScalar r_t2[T1d]; 636ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 637241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 638241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 639241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 640241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 641241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 642241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 643241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 644241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 645241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 646241a4b83SYohann } 647241a4b83SYohann } 648241a4b83SYohann 649ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 650ab213215SJeremy L Thompson // 3D derivatives transpose 651ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 652241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 653ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 65418d499f1SYohann CeedScalar r_t1[T1d]; 65518d499f1SYohann CeedScalar r_t2[T1d]; 656ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 657241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 658241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 659241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 660241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 661241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 662241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 663241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 664241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 665241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 666241a4b83SYohann } 667241a4b83SYohann } 668241a4b83SYohann 669ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 670ab213215SJeremy L Thompson // 3D collocated derivatives computation 671ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 672ac421f39SYohann template <int NCOMP, int Q1d> 673ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 67418d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 675ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 67618d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 677ac421f39SYohann __syncthreads(); 678ac421f39SYohann // X derivative 679ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 680ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 68118d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 682ac421f39SYohann // Y derivative 683ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 684ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 68518d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 686ac421f39SYohann // Z derivative 687ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 688ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 689ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 690ac421f39SYohann __syncthreads(); 691ac421f39SYohann } 692ac421f39SYohann } 69318d499f1SYohann } 694ac421f39SYohann 695ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 696ab213215SJeremy L Thompson // 3D collocated derivatives transpose 697ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 698ac421f39SYohann template <int NCOMP, int Q1d> 699ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 70018d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 701ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 702ac421f39SYohann // X derivative 70318d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 704ac421f39SYohann __syncthreads(); 705ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70618d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 707ac421f39SYohann __syncthreads(); 708ac421f39SYohann // Y derivative 70918d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 710ac421f39SYohann __syncthreads(); 711ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 71218d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 713ac421f39SYohann __syncthreads(); 714ac421f39SYohann // Z derivative 715ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 716ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 717ac421f39SYohann } 718ac421f39SYohann } 71918d499f1SYohann } 720ac421f39SYohann 721ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 722ab213215SJeremy L Thompson // 1D quadrature weights 723ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 724241a4b83SYohann template <int Q1d> 725053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 72618d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 727241a4b83SYohann } 728241a4b83SYohann 729ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 730ab213215SJeremy L Thompson // 2D quadrature weights 731ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 732241a4b83SYohann template <int Q1d> 733053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 73418d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 73518d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 736241a4b83SYohann } 737241a4b83SYohann 738ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 739ab213215SJeremy L Thompson // 3D quadrature weights 740ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 741241a4b83SYohann template <int Q1d> 742053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 74318d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 74418d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 745ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 74618d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 747241a4b83SYohann } 748241a4b83SYohann 749241a4b83SYohann ); 750ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 751ab213215SJeremy L Thompson // Build singe operator kernel 752ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 753241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 754241a4b83SYohann 755241a4b83SYohann using std::ostringstream; 756241a4b83SYohann using std::string; 757241a4b83SYohann int ierr; 758241a4b83SYohann bool setupdone; 759e15f9bd0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 760e15f9bd0SJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 761241a4b83SYohann Ceed ceed; 762e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 763241a4b83SYohann CeedOperator_Cuda_gen *data; 764e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 765241a4b83SYohann CeedQFunction qf; 766241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 767e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 768e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 769*e79b91d9SJeremy L Thompson CeedSize lsize; 77088db6d3bSJeremy L Thompson CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 771*e79b91d9SJeremy L Thompson numoutputfields, ncomp, dim = 0; 772e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 773e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 774241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 7757e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 776e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 777241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 7787e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 779e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 780241a4b83SYohann CeedEvalMode emode; 781241a4b83SYohann CeedBasis basis; 782241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 783241a4b83SYohann CeedElemRestriction Erestrict; 784461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 785241a4b83SYohann 7860b454692Sjeremylt // Check for restriction only identity operator 7870b454692Sjeremylt bool is_identity_qf; 7880b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 7890b454692Sjeremylt if (is_identity_qf) { 7900b454692Sjeremylt CeedEvalMode emodein, emodeout; 7910b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 7920b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 7930b454692Sjeremylt if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 7940b454692Sjeremylt // LCOV_EXCL_START 7950b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 7960b454692Sjeremylt "Backend does not implement restriction only identity operators"); 7970b454692Sjeremylt // LCOV_EXCL_STOP 7980b454692Sjeremylt } 7990b454692Sjeremylt 800241a4b83SYohann ostringstream code; 801241a4b83SYohann string devFunctions(deviceFunctions); 802241a4b83SYohann 803f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 804f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 805f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 80688db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 8070d0321e0SJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); 80880a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 809f1a13f77SYohann Dudouit code << atomicAdd; 810f1a13f77SYohann Dudouit } 811f1a13f77SYohann Dudouit 812241a4b83SYohann code << devFunctions; 813241a4b83SYohann 814241a4b83SYohann string qFunction(qf_data->qFunctionSource); 815c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 816c3c443faSYohann Dudouit string oper; 81714cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 8184d537eeaSYohann 8194d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 82077841947SLeila Ghaffari code << "#define CEED_QFUNCTION_HELPER inline __device__\n"; 821e15f9bd0SJeremy L Thompson code << "#define CeedPragmaSIMD\n"; 822e15f9bd0SJeremy L Thompson code << "#define CEED_ERROR_SUCCESS 0\n\n"; 8231da99368SJeremy L Thompson 8241da99368SJeremy L Thompson // Find dim and Q1d 82564d3f0c0Sjeremylt bool useCollograd = true; 82618d499f1SYohann data->maxP1d = 0; 8271da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 828e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 8291da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 830e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8310f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 832e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8331da99368SJeremy L Thompson 8341da99368SJeremy L Thompson // Check for collocated gradient 835437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8361da99368SJeremy L Thompson 8371da99368SJeremy L Thompson // Collect dim and Q1d 838e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8391da99368SJeremy L Thompson bool isTensor; 840e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8411da99368SJeremy L Thompson if (isTensor) { 842e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 843e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 84418d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8451da99368SJeremy L Thompson } else { 846e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 847e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 848e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8491da99368SJeremy L Thompson } 8501da99368SJeremy L Thompson } 8511da99368SJeremy L Thompson } 8521da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8531da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8541da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 855e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 8560f54b25eSjeremylt 8571da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 858e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8590f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 860e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8610f54b25eSjeremylt 8621da99368SJeremy L Thompson // Collect dim and Q1d 863e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8641da99368SJeremy L Thompson bool isTensor; 865e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8661da99368SJeremy L Thompson if (isTensor) { 867e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 8681da99368SJeremy L Thompson } else { 869e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 870e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 871e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8721da99368SJeremy L Thompson } 8730f54b25eSjeremylt 8740f54b25eSjeremylt // Check for collocated gradient 875437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8761da99368SJeremy L Thompson } 8771da99368SJeremy L Thompson } 8781da99368SJeremy L Thompson data->dim = dim; 8791da99368SJeremy L Thompson data->Q1d = Q1d; 8801da99368SJeremy L Thompson 8811da99368SJeremy L Thompson // Define CEED_Q_VLA 88264d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8831da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8841da99368SJeremy L Thompson } else { 8851da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8861da99368SJeremy L Thompson } 8871da99368SJeremy L Thompson 888241a4b83SYohann code << qFunction; 889241a4b83SYohann 890241a4b83SYohann // Setup 891818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 892c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 893241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 894241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 895e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8961da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 897241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 898241a4b83SYohann } 899241a4b83SYohann } 900241a4b83SYohann 901241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 902241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 903241a4b83SYohann } 9041da99368SJeremy L Thompson 905241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 906241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 9071da99368SJeremy L Thompson 908241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 909241a4b83SYohann code << " BackendData data;\n"; 910241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 911241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 912241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 913241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 91418d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 915920dcdc4Sjeremylt 916818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 917ac421f39SYohann //Initialize constants, and matrices B and G 918241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 919818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 920241a4b83SYohann // Get elemsize, emode, ncomp 921241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 922e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 923241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 924e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 925241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 926e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9274d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 928e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 929920dcdc4Sjeremylt 930920dcdc4Sjeremylt // Set field constants 931920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 932e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 933920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 934e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 935920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 936920dcdc4Sjeremylt } else { 937920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 938920dcdc4Sjeremylt } 939920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 940920dcdc4Sjeremylt } 941920dcdc4Sjeremylt 942920dcdc4Sjeremylt // Load basis data 943920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 944241a4b83SYohann switch (emode) { 945241a4b83SYohann case CEED_EVAL_NONE: 946241a4b83SYohann break; 947241a4b83SYohann case CEED_EVAL_INTERP: 948e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 949437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 951241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 952241a4b83SYohann break; 953241a4b83SYohann case CEED_EVAL_GRAD: 954e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 955437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 957241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 95864d3f0c0Sjeremylt if (useCollograd) { 959437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_collo_grad_1d; 96080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 961ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 962ac421f39SYohann } else { 963437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_grad_1d; 96480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 965241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 966ac421f39SYohann } 967241a4b83SYohann break; 968241a4b83SYohann case CEED_EVAL_WEIGHT: 969241a4b83SYohann break; // No action 970241a4b83SYohann case CEED_EVAL_DIV: 971241a4b83SYohann break; // TODO: Not implemented 972241a4b83SYohann case CEED_EVAL_CURL: 973241a4b83SYohann break; // TODO: Not implemented 974241a4b83SYohann } 975241a4b83SYohann } 976920dcdc4Sjeremylt 977818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 978241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 979818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 980241a4b83SYohann // Get elemsize, emode, ncomp 981241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 982e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 983241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 984e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 985241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 986e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9874d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 988e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 989920dcdc4Sjeremylt 990920dcdc4Sjeremylt // Set field constants 991e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 992920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 993e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 994241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 995920dcdc4Sjeremylt } else { 996920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 997920dcdc4Sjeremylt } 998920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 999920dcdc4Sjeremylt 1000920dcdc4Sjeremylt // Load basis data 1001920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1002920dcdc4Sjeremylt switch (emode) { 1003920dcdc4Sjeremylt case CEED_EVAL_NONE: 1004920dcdc4Sjeremylt break; // No action 1005920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1006e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1007437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 100880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1009241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1010241a4b83SYohann break; 1011241a4b83SYohann case CEED_EVAL_GRAD: 1012e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1013437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 101480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1015241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 101664d3f0c0Sjeremylt if (useCollograd) { 1017437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_collo_grad_1d; 101880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1019ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1020ac421f39SYohann } else { 1021437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_grad_1d; 102280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1023241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1024ac421f39SYohann } 1025241a4b83SYohann break; 1026e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1027241a4b83SYohann case CEED_EVAL_WEIGHT: { 1028241a4b83SYohann Ceed ceed; 1029e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1030e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1031241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1032241a4b83SYohann break; // Should not occur 1033241a4b83SYohann } 1034241a4b83SYohann case CEED_EVAL_DIV: 1035241a4b83SYohann break; // TODO: Not implemented 1036241a4b83SYohann case CEED_EVAL_CURL: 1037241a4b83SYohann break; // TODO: Not implemented 1038e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1039241a4b83SYohann } 1040241a4b83SYohann } 1041818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1042ac421f39SYohann code << " __syncthreads();\n"; 1043241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1044241a4b83SYohann // Input basis apply if needed 1045ac421f39SYohann // Generate the correct eval mode code for each input 1046818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1047241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1048818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1049241a4b83SYohann // Get elemsize, emode, ncomp 1050241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1051e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1052241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1053e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1054241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1055e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10564d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1057e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1058920dcdc4Sjeremylt 1059920dcdc4Sjeremylt // Restriction 1060920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 106164d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1062241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10639a2291e3SJeremy L Thompson 10649a2291e3SJeremy L Thompson bool isStrided; 1065e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 10669a2291e3SJeremy L Thompson if (!isStrided) { 10675c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1068e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10695c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10705c7b696cSjeremylt CeedInt compstride; 1071e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 10725c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1073e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 10749a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10755c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1076ccedf6b0Sjeremylt } else { 1077920dcdc4Sjeremylt bool backendstrides; 1078fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1079e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108013585805SJeremy L Thompson CeedInt nelem; 108113585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1082e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108313585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1084920dcdc4Sjeremylt if (!backendstrides) { 1085920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1086e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1087ccedf6b0Sjeremylt } 1088920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1089d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1090920dcdc4Sjeremylt } 1091920dcdc4Sjeremylt } 1092920dcdc4Sjeremylt 1093920dcdc4Sjeremylt // Basis action 1094920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1095920dcdc4Sjeremylt switch (emode) { 1096920dcdc4Sjeremylt case CEED_EVAL_NONE: 109764d3f0c0Sjeremylt if (!useCollograd) { 1098920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1099920dcdc4Sjeremylt } 1100920dcdc4Sjeremylt break; 1101920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1102241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1103241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1104241a4b83SYohann break; 1105241a4b83SYohann case CEED_EVAL_GRAD: 110664d3f0c0Sjeremylt if (useCollograd) { 1107ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1108ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1109ac421f39SYohann } else { 1110241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1111241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1112ac421f39SYohann } 1113241a4b83SYohann break; 1114241a4b83SYohann case CEED_EVAL_WEIGHT: 1115241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1116e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1117e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1118437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 1119241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1120241a4b83SYohann break; // No action 1121241a4b83SYohann case CEED_EVAL_DIV: 1122241a4b83SYohann break; // TODO: Not implemented 1123241a4b83SYohann case CEED_EVAL_CURL: 1124241a4b83SYohann break; // TODO: Not implemented 1125241a4b83SYohann } 1126241a4b83SYohann } 1127ac421f39SYohann 1128241a4b83SYohann // Q function 1129818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1130241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1131818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1132241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1133e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1134241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1135241a4b83SYohann { 113664d3f0c0Sjeremylt if (useCollograd) { 1137ac421f39SYohann //Accumulator for gradient slices 1138ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1139ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1140ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1141ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1142ac421f39SYohann code << " }\n"; 1143ac421f39SYohann code << " }\n"; 1144ac421f39SYohann } else { 1145241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1146241a4b83SYohann } 1147ac421f39SYohann } 1148241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1149241a4b83SYohann { 1150241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1151241a4b83SYohann } 1152241a4b83SYohann } 1153ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 115464d3f0c0Sjeremylt if (useCollograd) { 1155920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1156ac421f39SYohann code << "#pragma unroll\n"; 1157ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1158818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1159ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1160818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1161ac421f39SYohann // Get elemsize, emode, ncomp 1162ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1163e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1164ac421f39SYohann // Basis action 1165920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1166ac421f39SYohann switch (emode) { 1167ac421f39SYohann case CEED_EVAL_NONE: 1168ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11699a2291e3SJeremy L Thompson 11709a2291e3SJeremy L Thompson bool isStrided; 1171e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1172e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1173e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 11749a2291e3SJeremy L Thompson if (!isStrided) { 11755c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1176e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11775c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11785c7b696cSjeremylt CeedInt compstride; 1179e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 11805c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1181e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 11829a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11835c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1184920dcdc4Sjeremylt } else { 1185920dcdc4Sjeremylt bool backendstrides; 1186fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1187e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 118813585805SJeremy L Thompson CeedInt nelem; 118913585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1190e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 119113585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1192920dcdc4Sjeremylt if (!backendstrides) { 1193920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1194e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1195920dcdc4Sjeremylt } 1196920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1197d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1198920dcdc4Sjeremylt } 1199ac421f39SYohann break; 1200ac421f39SYohann case CEED_EVAL_INTERP: 1201ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1202ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1203ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1204ac421f39SYohann code << " }\n"; 1205ac421f39SYohann break; 1206ac421f39SYohann case CEED_EVAL_GRAD: 1207ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1208ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1209ac421f39SYohann break; 1210ac421f39SYohann case CEED_EVAL_WEIGHT: 1211ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1212ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1213ac421f39SYohann break; // No action 1214ac421f39SYohann case CEED_EVAL_DIV: 1215ac421f39SYohann break; // TODO: Not implemented 1216ac421f39SYohann case CEED_EVAL_CURL: 1217ac421f39SYohann break; // TODO: Not implemented 1218ac421f39SYohann } 1219ac421f39SYohann } 1220818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1221ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1222818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1223ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1224e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1225ac421f39SYohann // Basis action 1226ac421f39SYohann switch (emode) { 1227ac421f39SYohann case CEED_EVAL_NONE: 1228ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1229ac421f39SYohann break; // No action 1230ac421f39SYohann case CEED_EVAL_INTERP: 1231ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1232ac421f39SYohann break; 1233ac421f39SYohann case CEED_EVAL_GRAD: 1234ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1235ac421f39SYohann break; 1236ac421f39SYohann case CEED_EVAL_WEIGHT: 1237ac421f39SYohann break; // Should not occur 1238ac421f39SYohann case CEED_EVAL_DIV: 1239ac421f39SYohann break; // TODO: Not implemented 1240ac421f39SYohann case CEED_EVAL_CURL: 1241ac421f39SYohann break; // TODO: Not implemented 1242ac421f39SYohann } 1243ac421f39SYohann } 1244ac421f39SYohann } else { 1245920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1246818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1247ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1248818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1249ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1250ac421f39SYohann } 1251818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1252ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1253818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1254ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1255ac421f39SYohann } 1256ac421f39SYohann } 1257818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12584d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12594d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1260818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1261ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12624d537eeaSYohann } 12634d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12644d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1265818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1266ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12674d537eeaSYohann } 1268818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1269ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 127064d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1271ac421f39SYohann code << "1"; 1272ac421f39SYohann } else { 1273ac421f39SYohann code << "Q1d"; 1274ac421f39SYohann } 1275ac421f39SYohann code << ", in, out);\n"; 127664d3f0c0Sjeremylt if (useCollograd) { 1277920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1278818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1279ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1280818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1281ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1282e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1283ac421f39SYohann // Basis action 1284920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1285ac421f39SYohann switch (emode) { 1286ac421f39SYohann case CEED_EVAL_NONE: 1287ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1288ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1289ac421f39SYohann code << " }\n"; 1290ac421f39SYohann break; // No action 1291ac421f39SYohann case CEED_EVAL_INTERP: 1292ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1293ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1294ac421f39SYohann code << " }\n"; 1295ac421f39SYohann break; 1296ac421f39SYohann case CEED_EVAL_GRAD: 1297ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1298ac421f39SYohann break; 1299ac421f39SYohann case CEED_EVAL_WEIGHT: 1300ac421f39SYohann break; // Should not occur 1301ac421f39SYohann case CEED_EVAL_DIV: 1302ac421f39SYohann break; // TODO: Not implemented 1303ac421f39SYohann case CEED_EVAL_CURL: 1304ac421f39SYohann break; // TODO: Not implemented 1305ac421f39SYohann } 1306ac421f39SYohann } 1307ac421f39SYohann code << " }\n"; 1308ac421f39SYohann } 1309241a4b83SYohann 1310241a4b83SYohann // Output basis apply if needed 1311ac421f39SYohann // Generate the correct eval mode code for each output 1312818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1313241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1314818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1315241a4b83SYohann // Get elemsize, emode, ncomp 1316241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1317e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1318241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1319e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1320241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1321e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13224d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1323e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1324241a4b83SYohann // Basis action 1325920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1326241a4b83SYohann switch (emode) { 1327241a4b83SYohann case CEED_EVAL_NONE: 1328920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1329241a4b83SYohann break; // No action 1330241a4b83SYohann case CEED_EVAL_INTERP: 1331241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1332241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1333241a4b83SYohann break; 1334241a4b83SYohann case CEED_EVAL_GRAD: 1335241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 133664d3f0c0Sjeremylt if (useCollograd) { 1337ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1338ac421f39SYohann } else { 1339241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1340ac421f39SYohann } 1341241a4b83SYohann break; 1342e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1343241a4b83SYohann case CEED_EVAL_WEIGHT: { 1344241a4b83SYohann Ceed ceed; 1345e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1346e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1347241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1348241a4b83SYohann break; // Should not occur 1349241a4b83SYohann } 1350241a4b83SYohann case CEED_EVAL_DIV: 1351241a4b83SYohann break; // TODO: Not implemented 1352241a4b83SYohann case CEED_EVAL_CURL: 1353241a4b83SYohann break; // TODO: Not implemented 1354e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1355241a4b83SYohann } 1356920dcdc4Sjeremylt // Restriction 13579a2291e3SJeremy L Thompson bool isStrided; 1358e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 13599a2291e3SJeremy L Thompson if (!isStrided) { 13605c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1361e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13625c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13635c7b696cSjeremylt CeedInt compstride; 1364e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 13655c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1366e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 13679a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13685c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1369920dcdc4Sjeremylt } else { 1370920dcdc4Sjeremylt bool backendstrides; 1371fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1372e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137313585805SJeremy L Thompson CeedInt nelem; 137413585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1375e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137613585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1377920dcdc4Sjeremylt if (!backendstrides) { 1378920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1379e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1380920dcdc4Sjeremylt } 1381920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1382d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1383920dcdc4Sjeremylt } 1384241a4b83SYohann } 1385241a4b83SYohann 1386241a4b83SYohann code << " }\n"; 1387818e0025SJeremy L Thompson code << "}\n"; 1388818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1389241a4b83SYohann 1390ab213215SJeremy L Thompson // View kernel for debugging 139146dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 13923f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 1393241a4b83SYohann 139418d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 139518d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1396e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1397c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1398e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1399241a4b83SYohann 1400e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1401e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1402241a4b83SYohann } 1403ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1404