1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39f1a13f77SYohann Dudouit ); 40f1a13f77SYohann Dudouit 41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45f1a13f77SYohann Dudouit 46f1a13f77SYohann Dudouit typedef struct { 47f1a13f77SYohann Dudouit CeedInt tidx; 48f1a13f77SYohann Dudouit CeedInt tidy; 49f1a13f77SYohann Dudouit CeedInt tidz; 50f1a13f77SYohann Dudouit CeedInt tid; 51f1a13f77SYohann Dudouit CeedScalar* slice; 52f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56ac421f39SYohann for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann } 60241a4b83SYohann 61241a4b83SYohann //**** 62241a4b83SYohann // 1D 63*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 64*5c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 65241a4b83SYohann if (data.tidx<P1d) 66241a4b83SYohann { 674d537eeaSYohann const CeedInt node = data.tidx; 68ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 69241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 70*5c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 71241a4b83SYohann } 72241a4b83SYohann } 73241a4b83SYohann } 74920dcdc4Sjeremylt 75d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 76d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 77ccedf6b0Sjeremylt if (data.tidx<P1d) 78ccedf6b0Sjeremylt { 79ccedf6b0Sjeremylt const CeedInt node = data.tidx; 80d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 81ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 82d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 83ccedf6b0Sjeremylt } 84ccedf6b0Sjeremylt } 85ccedf6b0Sjeremylt } 86241a4b83SYohann 87*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 88*5c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 89241a4b83SYohann if (data.tidx<P1d) 90241a4b83SYohann { 914d537eeaSYohann const CeedInt node = data.tidx; 92ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 93241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 94*5c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 95241a4b83SYohann } 96241a4b83SYohann } 97241a4b83SYohann } 98241a4b83SYohann 99d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 100d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 101ccedf6b0Sjeremylt if (data.tidx<P1d) 102ccedf6b0Sjeremylt { 103ccedf6b0Sjeremylt const CeedInt node = data.tidx; 104d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 105ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 106d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 107ccedf6b0Sjeremylt } 108ccedf6b0Sjeremylt } 109ccedf6b0Sjeremylt } 110ccedf6b0Sjeremylt 111241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 112241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 113241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 114241a4b83SYohann data.slice[data.tidx] = *U; 115241a4b83SYohann __syncthreads(); 116241a4b83SYohann *V = 0.0; 117ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 118241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 119241a4b83SYohann } 120241a4b83SYohann __syncthreads(); 121241a4b83SYohann } 122241a4b83SYohann 123241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 124241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 125241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 126241a4b83SYohann data.slice[data.tidx] = *U; 127241a4b83SYohann __syncthreads(); 128241a4b83SYohann *V = 0.0; 129ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 130241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 131241a4b83SYohann } 132241a4b83SYohann __syncthreads(); 133241a4b83SYohann } 134241a4b83SYohann 135241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 136241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 137241a4b83SYohann CeedScalar *__restrict__ r_V) { 138ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 139241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 140241a4b83SYohann } 141241a4b83SYohann } 142241a4b83SYohann 143241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 144241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 145241a4b83SYohann CeedScalar *__restrict__ r_V) { 146ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 147241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 148241a4b83SYohann } 149241a4b83SYohann } 150241a4b83SYohann 151241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 152241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 153241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 154241a4b83SYohann CeedScalar *__restrict__ r_V) { 155ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 156241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 157241a4b83SYohann } 158241a4b83SYohann } 159241a4b83SYohann 160241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 161241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 162241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 163241a4b83SYohann CeedScalar *__restrict__ r_V) { 164ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 165241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 166241a4b83SYohann } 167241a4b83SYohann } 168241a4b83SYohann 169241a4b83SYohann //**** 170241a4b83SYohann // 2D 171*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 172*5c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 173241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 174241a4b83SYohann { 1754d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 176ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 177241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 178*5c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 179241a4b83SYohann } 180241a4b83SYohann } 181241a4b83SYohann } 182920dcdc4Sjeremylt 183d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 184d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 185ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 186ccedf6b0Sjeremylt { 187ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 188d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 189ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 190d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 191ccedf6b0Sjeremylt } 192ccedf6b0Sjeremylt } 193ccedf6b0Sjeremylt } 194241a4b83SYohann 195*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 196*5c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 197241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 198241a4b83SYohann { 1994d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 200ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 201241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 202*5c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 203241a4b83SYohann } 204241a4b83SYohann } 205241a4b83SYohann } 206241a4b83SYohann 207d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 208d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 209ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 210ccedf6b0Sjeremylt { 211ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 212d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 213ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 214d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 215ccedf6b0Sjeremylt } 216ccedf6b0Sjeremylt } 217ccedf6b0Sjeremylt } 218ccedf6b0Sjeremylt 219241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 220241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 221241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 222241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 223241a4b83SYohann __syncthreads(); 224241a4b83SYohann *V = 0.0; 225ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 226241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 227241a4b83SYohann } 228241a4b83SYohann __syncthreads(); 229241a4b83SYohann } 230241a4b83SYohann 231241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 232241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 233241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 234241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 235241a4b83SYohann __syncthreads(); 236241a4b83SYohann *V = 0.0; 237ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 238241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 239241a4b83SYohann } 240241a4b83SYohann __syncthreads(); 241241a4b83SYohann } 242241a4b83SYohann 243241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 244241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 245241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 246241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 247241a4b83SYohann __syncthreads(); 248241a4b83SYohann *V = 0.0; 249241a4b83SYohann if (data.tidy<P1d) { 250ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 251241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 252241a4b83SYohann } 253241a4b83SYohann } 254241a4b83SYohann __syncthreads(); 255241a4b83SYohann } 256241a4b83SYohann 257241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 258241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 259241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 260241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 261241a4b83SYohann __syncthreads(); 262241a4b83SYohann *V = 0.0; 263241a4b83SYohann if (data.tidx<P1d) { 264ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 265241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 266241a4b83SYohann } 267241a4b83SYohann } 268241a4b83SYohann __syncthreads(); 269241a4b83SYohann } 270241a4b83SYohann 271241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 272241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 273241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 274241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 275241a4b83SYohann __syncthreads(); 276241a4b83SYohann if (data.tidx<P1d) { 277ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 278241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 279241a4b83SYohann } 280241a4b83SYohann } 281241a4b83SYohann __syncthreads(); 282241a4b83SYohann } 283241a4b83SYohann 284241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 285241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 286241a4b83SYohann CeedScalar *__restrict__ r_V) { 287241a4b83SYohann CeedScalar r_t[1]; 288ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 289241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 290241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 291241a4b83SYohann } 292241a4b83SYohann } 293241a4b83SYohann 294241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 295241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 296241a4b83SYohann CeedScalar *__restrict__ r_V) { 297241a4b83SYohann CeedScalar r_t[1]; 298ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 299241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 300241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 301241a4b83SYohann } 302241a4b83SYohann } 303241a4b83SYohann 304241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 305241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 306241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 307241a4b83SYohann CeedScalar *__restrict__ r_V) { 308241a4b83SYohann CeedScalar r_t[1]; 309ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 310241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 311241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 312241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 313241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 314241a4b83SYohann } 315241a4b83SYohann } 316241a4b83SYohann 317241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 318241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 319241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 320241a4b83SYohann CeedScalar *__restrict__ r_V) { 321241a4b83SYohann CeedScalar r_t[1]; 322ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 323241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 324241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 325241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 326241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 327241a4b83SYohann } 328241a4b83SYohann } 329241a4b83SYohann 330241a4b83SYohann //**** 331241a4b83SYohann // 3D 332*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 333*5c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 334241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 335241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3364d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 337ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 338241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 339*5c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 340241a4b83SYohann } 341241a4b83SYohann } 342241a4b83SYohann } 343241a4b83SYohann } 344920dcdc4Sjeremylt 345d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 346d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 347ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 348ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 349ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 350d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 351ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 352d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 353ccedf6b0Sjeremylt } 354ccedf6b0Sjeremylt } 355ccedf6b0Sjeremylt } 356ccedf6b0Sjeremylt } 357241a4b83SYohann 358*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 359*5c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 360ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 361920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 362ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 363*5c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 364ac421f39SYohann } 365ac421f39SYohann } 366ac421f39SYohann 367d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 36825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 369920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 370d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 371920dcdc4Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 372d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 373920dcdc4Sjeremylt } 374920dcdc4Sjeremylt } 375920dcdc4Sjeremylt 376*5c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 377*5c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 378241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 379241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3804d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 381ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 382241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 383*5c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 384241a4b83SYohann } 385241a4b83SYohann } 386241a4b83SYohann } 387241a4b83SYohann } 388241a4b83SYohann 389d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 3902f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 391ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 392ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 393ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 394d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 395ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 396d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 397ccedf6b0Sjeremylt } 398ccedf6b0Sjeremylt } 399ccedf6b0Sjeremylt } 400ccedf6b0Sjeremylt } 401ccedf6b0Sjeremylt 402241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 403241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 404241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 405ac421f39SYohann CeedScalar r_B[P1d]; 406ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 407ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 408ac421f39SYohann } 409ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 410241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 411241a4b83SYohann __syncthreads(); 412241a4b83SYohann V[k] = 0.0; 413ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 414ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 415241a4b83SYohann } 416241a4b83SYohann __syncthreads(); 417241a4b83SYohann } 418241a4b83SYohann } 419241a4b83SYohann 420241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 421241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 422241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 423ac421f39SYohann CeedScalar r_B[P1d]; 424ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 425ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 426ac421f39SYohann } 427ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 428241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 429241a4b83SYohann __syncthreads(); 430241a4b83SYohann V[k] = 0.0; 431ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 432ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 433241a4b83SYohann } 434241a4b83SYohann __syncthreads(); 435241a4b83SYohann } 436241a4b83SYohann } 437241a4b83SYohann 438241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 439241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 440241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 441ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 442241a4b83SYohann V[k] = 0.0; 443ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 444241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 445241a4b83SYohann } 446241a4b83SYohann } 447241a4b83SYohann } 448241a4b83SYohann 449241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 450241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 451241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 452ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 453241a4b83SYohann V[k] = 0.0; 454241a4b83SYohann if (k<P1d) { 455ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 456241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 457241a4b83SYohann } 458241a4b83SYohann } 459241a4b83SYohann } 460241a4b83SYohann } 461241a4b83SYohann 462241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 463241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 464241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 465ac421f39SYohann CeedScalar r_B[Q1d]; 466ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 467ac421f39SYohann { 468ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 469ac421f39SYohann } 470ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 471241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 472241a4b83SYohann __syncthreads(); 473241a4b83SYohann V[k] = 0.0; 474241a4b83SYohann if (data.tidy<P1d) { 475ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 476ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 477ac421f39SYohann } 478ac421f39SYohann } 479ac421f39SYohann __syncthreads(); 480ac421f39SYohann } 481ac421f39SYohann } 482ac421f39SYohann 483ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 484ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data, 485ac421f39SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 486ac421f39SYohann CeedScalar r_B[Q1d]; 487ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 488ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 489ac421f39SYohann } 490ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 491ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 492ac421f39SYohann __syncthreads(); 493ac421f39SYohann if (data.tidy<P1d) { 494ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 495ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 496241a4b83SYohann } 497241a4b83SYohann } 498241a4b83SYohann __syncthreads(); 499241a4b83SYohann } 500241a4b83SYohann } 501241a4b83SYohann 502241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 503241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 504241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 505ac421f39SYohann CeedScalar r_B[Q1d]; 506ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 507ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 508ac421f39SYohann } 509ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 510241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 511241a4b83SYohann __syncthreads(); 512241a4b83SYohann V[k] = 0.0; 513241a4b83SYohann if (data.tidx<P1d) { 514ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 515ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 516241a4b83SYohann } 517241a4b83SYohann } 518241a4b83SYohann __syncthreads(); 519241a4b83SYohann } 520241a4b83SYohann } 521241a4b83SYohann 522241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 523241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 524241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 525ac421f39SYohann CeedScalar r_B[Q1d]; 526ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 527ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 528ac421f39SYohann } 529ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 530241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 531241a4b83SYohann __syncthreads(); 532241a4b83SYohann if (data.tidx<P1d) { 533ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 534ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 535241a4b83SYohann } 536241a4b83SYohann } 537241a4b83SYohann __syncthreads(); 538241a4b83SYohann } 539241a4b83SYohann } 540241a4b83SYohann 541241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 542241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 543241a4b83SYohann CeedScalar *__restrict__ r_V) { 544241a4b83SYohann CeedScalar r_t1[Q1d]; 545241a4b83SYohann CeedScalar r_t2[Q1d]; 546ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 547241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 548241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 549241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 550241a4b83SYohann } 551241a4b83SYohann } 552241a4b83SYohann 553241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 554241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 555241a4b83SYohann CeedScalar *__restrict__ r_V) { 556241a4b83SYohann CeedScalar r_t1[Q1d]; 557241a4b83SYohann CeedScalar r_t2[Q1d]; 558ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 559241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 560241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 561241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 562241a4b83SYohann } 563241a4b83SYohann } 564241a4b83SYohann 565241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 566241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 567241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 568241a4b83SYohann CeedScalar *__restrict__ r_V) { 569241a4b83SYohann CeedScalar r_t1[Q1d]; 570241a4b83SYohann CeedScalar r_t2[Q1d]; 571ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 572241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 573241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 574241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 575241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 576241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 577241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 578241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 579241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 580241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 581241a4b83SYohann } 582241a4b83SYohann } 583241a4b83SYohann 584241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 585241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 586241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 587241a4b83SYohann CeedScalar *__restrict__ r_V) { 588241a4b83SYohann CeedScalar r_t1[Q1d]; 589241a4b83SYohann CeedScalar r_t2[Q1d]; 590ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 591241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 592241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 593241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 594241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 595241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 596241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 597241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 598241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 599241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 600241a4b83SYohann } 601241a4b83SYohann } 602241a4b83SYohann 603ac421f39SYohann template <int NCOMP, int Q1d> 604ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, 605ac421f39SYohann const CeedScalar *__restrict__ r_U, 606ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 607ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 608ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d]; 609ac421f39SYohann __syncthreads(); 610ac421f39SYohann // X derivative 611ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 612ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 613ac421f39SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 614ac421f39SYohann } 615ac421f39SYohann // Y derivative 616ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 617ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 618ac421f39SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 619ac421f39SYohann } 620ac421f39SYohann // Z derivative 621ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 622ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 623ac421f39SYohann r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative) 624ac421f39SYohann } 625ac421f39SYohann __syncthreads(); 626ac421f39SYohann } 627ac421f39SYohann } 628ac421f39SYohann 629ac421f39SYohann template <int NCOMP, int Q1d> 630ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, 631ac421f39SYohann const CeedScalar *__restrict__ r_U, 632ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 633ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 634ac421f39SYohann // X derivative 635ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 636ac421f39SYohann __syncthreads(); 637ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 638ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 639ac421f39SYohann } 640ac421f39SYohann __syncthreads(); 641ac421f39SYohann // Y derivative 642ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 643ac421f39SYohann __syncthreads(); 644ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 645ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 646ac421f39SYohann } 647ac421f39SYohann __syncthreads(); 648ac421f39SYohann // Z derivative 649ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 650ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative) 651ac421f39SYohann } 652ac421f39SYohann } 653ac421f39SYohann } 654ac421f39SYohann 655241a4b83SYohann template <int Q1d> 656241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 657241a4b83SYohann *w = qweight1d[data.tidx]; 658241a4b83SYohann } 659241a4b83SYohann 660241a4b83SYohann template <int Q1d> 661241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 662241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 663241a4b83SYohann } 664241a4b83SYohann 665241a4b83SYohann template <int Q1d> 666241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 667241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 668ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 669241a4b83SYohann { 670241a4b83SYohann w[z] = pw*qweight1d[z]; 671241a4b83SYohann } 672241a4b83SYohann } 673241a4b83SYohann 674241a4b83SYohann ); 675241a4b83SYohann 676241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 677241a4b83SYohann 678241a4b83SYohann using std::ostringstream; 679241a4b83SYohann using std::string; 680241a4b83SYohann int ierr; 681241a4b83SYohann bool setupdone; 682241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 683241a4b83SYohann if (setupdone) return 0; 684241a4b83SYohann Ceed ceed; 685241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 686241a4b83SYohann CeedOperator_Cuda_gen *data; 687241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 688241a4b83SYohann CeedQFunction qf; 689241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 690241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 691241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 6921da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 693*5c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 694241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 695241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 696241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 697241a4b83SYohann CeedChk(ierr); 698241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 699241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 700241a4b83SYohann CeedChk(ierr); 701241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 702241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 703241a4b83SYohann CeedChk(ierr); 704241a4b83SYohann CeedEvalMode emode; 705241a4b83SYohann CeedBasis basis; 706241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 707241a4b83SYohann CeedElemRestriction Erestrict; 708241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 709241a4b83SYohann 710241a4b83SYohann ostringstream code; 711241a4b83SYohann string devFunctions(deviceFunctions); 712241a4b83SYohann 713f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 714f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 715f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 716abfaacbbSSander Arens ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); 717f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 718f1a13f77SYohann Dudouit if (prop.major<6){ 719f1a13f77SYohann Dudouit code << atomicAdd; 720f1a13f77SYohann Dudouit } 721f1a13f77SYohann Dudouit 722241a4b83SYohann code << devFunctions; 723241a4b83SYohann 724241a4b83SYohann string qFunction(qf_data->qFunctionSource); 7254d537eeaSYohann 7264d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 727ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 7281da99368SJeremy L Thompson 7291da99368SJeremy L Thompson // Find dim and Q1d 7301da99368SJeremy L Thompson bool collograd = false; 7311da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 7321da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 7331da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 7341da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 7351da99368SJeremy L Thompson 7361da99368SJeremy L Thompson // Check for collocated gradient 7371da99368SJeremy L Thompson if (basis_data->d_collograd1d) 7381da99368SJeremy L Thompson collograd = true; 7391da99368SJeremy L Thompson 7401da99368SJeremy L Thompson // Collect dim and Q1d 7411da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 7421da99368SJeremy L Thompson bool isTensor; 7431da99368SJeremy L Thompson ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 7441da99368SJeremy L Thompson if (isTensor) { 7451da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 7461da99368SJeremy L Thompson } else { 7471da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 7481da99368SJeremy L Thompson } 7491da99368SJeremy L Thompson } 7501da99368SJeremy L Thompson } 7511da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 7521da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 7531da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 7541da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 7551da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 7561da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 7571da99368SJeremy L Thompson // Collect dim and Q1d 7581da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 7591da99368SJeremy L Thompson bool isTensor; 7601da99368SJeremy L Thompson ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 7611da99368SJeremy L Thompson if (isTensor) { 7621da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 7631da99368SJeremy L Thompson } else { 7641da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 7651da99368SJeremy L Thompson } 7661da99368SJeremy L Thompson } 7671da99368SJeremy L Thompson } 7681da99368SJeremy L Thompson data->dim = dim; 7691da99368SJeremy L Thompson data->Q1d = Q1d; 7701da99368SJeremy L Thompson 7711da99368SJeremy L Thompson // Define CEED_Q_VLA 7721da99368SJeremy L Thompson if (dim != 3 || collograd) { 7731da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 7741da99368SJeremy L Thompson } else { 7751da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 7761da99368SJeremy L Thompson } 7771da99368SJeremy L Thompson 778241a4b83SYohann code << qFunction; 779241a4b83SYohann 780241a4b83SYohann // Setup 781d80fc06aSjeremylt code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 782241a4b83SYohann // Input Evecs and Restriction 783241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 784241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 785241a4b83SYohann CeedChk(ierr); 7861da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 787241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 788241a4b83SYohann } 789241a4b83SYohann } 790241a4b83SYohann 791241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 792241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 793241a4b83SYohann } 7941da99368SJeremy L Thompson 795241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 796241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 7971da99368SJeremy L Thompson 798241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 799241a4b83SYohann code << "BackendData data;\n"; 800241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 801241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 802241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 803241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 804241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 805920dcdc4Sjeremylt 806920dcdc4Sjeremylt code << "\n// Input field constants and basis data\n"; 807ac421f39SYohann //Initialize constants, and matrices B and G 808241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 809920dcdc4Sjeremylt code << "// -- Input field "<<i<<" --\n"; 810241a4b83SYohann // Get elemsize, emode, ncomp 811241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 812241a4b83SYohann CeedChk(ierr); 813241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 814241a4b83SYohann CeedChk(ierr); 815241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 816241a4b83SYohann CeedChk(ierr); 8174d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 818241a4b83SYohann CeedChk(ierr); 819920dcdc4Sjeremylt 820920dcdc4Sjeremylt // Set field constants 821920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 822920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 823920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 824920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 825920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 826920dcdc4Sjeremylt } else { 827920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 828920dcdc4Sjeremylt } 829920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 830920dcdc4Sjeremylt } 831920dcdc4Sjeremylt 832920dcdc4Sjeremylt // Load basis data 833920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 834241a4b83SYohann switch (emode) { 835241a4b83SYohann case CEED_EVAL_NONE: 836241a4b83SYohann break; 837241a4b83SYohann case CEED_EVAL_INTERP: 838241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 839241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 840241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 841241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 842241a4b83SYohann break; 843241a4b83SYohann case CEED_EVAL_GRAD: 844241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 845241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 846241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 847241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 848ac421f39SYohann if (basis_data->d_collograd1d) { 849ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 850ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 851ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 852ac421f39SYohann } else { 853ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 854ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 855241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 856ac421f39SYohann } 857241a4b83SYohann break; 858241a4b83SYohann case CEED_EVAL_WEIGHT: 859241a4b83SYohann break; // No action 860241a4b83SYohann case CEED_EVAL_DIV: 861241a4b83SYohann break; // TODO: Not implemented 862241a4b83SYohann case CEED_EVAL_CURL: 863241a4b83SYohann break; // TODO: Not implemented 864241a4b83SYohann } 865241a4b83SYohann } 866920dcdc4Sjeremylt 867920dcdc4Sjeremylt code << "\n// Output field constants and basis data\n"; 868241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 869920dcdc4Sjeremylt code << "// -- Output field "<<i<<" --\n"; 870241a4b83SYohann // Get elemsize, emode, ncomp 871241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 872241a4b83SYohann CeedChk(ierr); 873241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 874241a4b83SYohann CeedChk(ierr); 875241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 876241a4b83SYohann CeedChk(ierr); 8774d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 878241a4b83SYohann CeedChk(ierr); 879920dcdc4Sjeremylt 880920dcdc4Sjeremylt // Set field constants 881241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 882920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 883241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 884241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 885920dcdc4Sjeremylt } else { 886920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 887920dcdc4Sjeremylt } 888920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 889920dcdc4Sjeremylt 890920dcdc4Sjeremylt // Load basis data 891920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 892920dcdc4Sjeremylt switch (emode) { 893920dcdc4Sjeremylt case CEED_EVAL_NONE: 894920dcdc4Sjeremylt break; // No action 895920dcdc4Sjeremylt case CEED_EVAL_INTERP: 896241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 897241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 898241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 899241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 900241a4b83SYohann break; 901241a4b83SYohann case CEED_EVAL_GRAD: 902241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 903241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 904241a4b83SYohann code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 905241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 906ac421f39SYohann if (basis_data->d_collograd1d) { 907ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 908ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 909ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 910ac421f39SYohann } else { 911ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 912ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 913241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 914ac421f39SYohann } 915241a4b83SYohann break; 916241a4b83SYohann case CEED_EVAL_WEIGHT: { 917241a4b83SYohann Ceed ceed; 918241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 919241a4b83SYohann return CeedError(ceed, 1, 920241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 921241a4b83SYohann break; // Should not occur 922241a4b83SYohann } 923241a4b83SYohann case CEED_EVAL_DIV: 924241a4b83SYohann break; // TODO: Not implemented 925241a4b83SYohann case CEED_EVAL_CURL: 926241a4b83SYohann break; // TODO: Not implemented 927241a4b83SYohann } 928241a4b83SYohann } 929ac421f39SYohann code << "\n"; 930ac421f39SYohann code << "__syncthreads();\n"; 931241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 932241a4b83SYohann // Input basis apply if needed 933ac421f39SYohann // Generate the correct eval mode code for each input 934920dcdc4Sjeremylt code << "\n// Input field restrictions and basis actions\n"; 935241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 936920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 937241a4b83SYohann // Get elemsize, emode, ncomp 938241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 939241a4b83SYohann CeedChk(ierr); 940241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 941241a4b83SYohann CeedChk(ierr); 942241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 943241a4b83SYohann CeedChk(ierr); 9444d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 945241a4b83SYohann CeedChk(ierr); 946920dcdc4Sjeremylt 947920dcdc4Sjeremylt // Restriction 948920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 949920dcdc4Sjeremylt !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 950241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 951241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 952241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 953f2b2a896Sjeremylt if (data->indices.in[i]) { 954*5c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 955*5c7b696cSjeremylt CeedChk(ierr); 956*5c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 957*5c7b696cSjeremylt CeedInt compstride; 958*5c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 959*5c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 960*5c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 961ccedf6b0Sjeremylt } else { 962920dcdc4Sjeremylt bool backendstrides; 963920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 964920dcdc4Sjeremylt &backendstrides); 965920dcdc4Sjeremylt CeedChk(ierr); 966920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 967920dcdc4Sjeremylt if (!backendstrides) { 968920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 969920dcdc4Sjeremylt CeedChk(ierr); 970ccedf6b0Sjeremylt } 971920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 972d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 973920dcdc4Sjeremylt } 974920dcdc4Sjeremylt } 975920dcdc4Sjeremylt 976920dcdc4Sjeremylt // Basis action 977920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 978920dcdc4Sjeremylt switch (emode) { 979920dcdc4Sjeremylt case CEED_EVAL_NONE: 980920dcdc4Sjeremylt if (!basis_data->d_collograd1d) { 981920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 982920dcdc4Sjeremylt } 983920dcdc4Sjeremylt break; 984920dcdc4Sjeremylt case CEED_EVAL_INTERP: 985241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 986241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 987241a4b83SYohann break; 988241a4b83SYohann case CEED_EVAL_GRAD: 989ac421f39SYohann if (basis_data->d_collograd1d) { 990ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 991ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 992ac421f39SYohann } else { 993241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 994241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 995ac421f39SYohann } 996241a4b83SYohann break; 997241a4b83SYohann case CEED_EVAL_WEIGHT: 998241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 999241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1000241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1001241a4b83SYohann data->W = basis_data->d_qweight1d; 1002241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1003241a4b83SYohann break; // No action 1004241a4b83SYohann case CEED_EVAL_DIV: 1005241a4b83SYohann break; // TODO: Not implemented 1006241a4b83SYohann case CEED_EVAL_CURL: 1007241a4b83SYohann break; // TODO: Not implemented 1008241a4b83SYohann } 1009241a4b83SYohann } 1010ac421f39SYohann 1011241a4b83SYohann // Q function 1012920dcdc4Sjeremylt code << "\n// Output field setup\n"; 1013241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1014920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1015241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1016241a4b83SYohann CeedChk(ierr); 1017241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1018241a4b83SYohann { 1019ac421f39SYohann if (basis_data->d_collograd1d) { 1020ac421f39SYohann //Accumulator for gradient slices 1021ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1022ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1023ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1024ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1025ac421f39SYohann code << " }\n"; 1026ac421f39SYohann code << " }\n"; 1027ac421f39SYohann } else { 1028241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1029241a4b83SYohann } 1030ac421f39SYohann } 1031241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1032241a4b83SYohann { 1033241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1034241a4b83SYohann } 1035241a4b83SYohann } 1036ac421f39SYohann //We treat quadrature points per slice in 3d to save registers 1037ac421f39SYohann if (basis_data->d_collograd1d) { 1038920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1039ac421f39SYohann code << "#pragma unroll\n"; 1040ac421f39SYohann code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1041ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1042920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1043ac421f39SYohann // Get elemsize, emode, ncomp 1044ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1045ac421f39SYohann CeedChk(ierr); 1046ac421f39SYohann // Basis action 1047920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1048ac421f39SYohann switch (emode) { 1049ac421f39SYohann case CEED_EVAL_NONE: 1050ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1051920dcdc4Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1052920dcdc4Sjeremylt data->indices.in[i] = restr_data->d_ind; 1053920dcdc4Sjeremylt if (data->indices.in[i]) { 1054*5c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1055*5c7b696cSjeremylt CeedChk(ierr); 1056*5c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 1057*5c7b696cSjeremylt CeedInt compstride; 1058*5c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1059*5c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1060*5c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1061920dcdc4Sjeremylt } else { 1062920dcdc4Sjeremylt bool backendstrides; 1063920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1064920dcdc4Sjeremylt &backendstrides); 1065920dcdc4Sjeremylt CeedChk(ierr); 1066920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1067920dcdc4Sjeremylt if (!backendstrides) { 1068920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1069920dcdc4Sjeremylt CeedChk(ierr); 1070920dcdc4Sjeremylt } 1071920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1072d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1073920dcdc4Sjeremylt } 1074ac421f39SYohann break; 1075ac421f39SYohann case CEED_EVAL_INTERP: 1076ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1077ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1078ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1079ac421f39SYohann code << " }\n"; 1080ac421f39SYohann break; 1081ac421f39SYohann case CEED_EVAL_GRAD: 1082ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1083ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1084ac421f39SYohann break; 1085ac421f39SYohann case CEED_EVAL_WEIGHT: 1086ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1087ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1088ac421f39SYohann break; // No action 1089ac421f39SYohann case CEED_EVAL_DIV: 1090ac421f39SYohann break; // TODO: Not implemented 1091ac421f39SYohann case CEED_EVAL_CURL: 1092ac421f39SYohann break; // TODO: Not implemented 1093ac421f39SYohann } 1094ac421f39SYohann } 1095ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1096920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1097ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1098ac421f39SYohann CeedChk(ierr); 1099ac421f39SYohann // Basis action 1100ac421f39SYohann switch (emode) { 1101ac421f39SYohann case CEED_EVAL_NONE: 1102ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1103ac421f39SYohann break; // No action 1104ac421f39SYohann case CEED_EVAL_INTERP: 1105ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1106ac421f39SYohann break; 1107ac421f39SYohann case CEED_EVAL_GRAD: 1108ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1109ac421f39SYohann break; 1110ac421f39SYohann case CEED_EVAL_WEIGHT: 1111ac421f39SYohann break; // Should not occur 1112ac421f39SYohann case CEED_EVAL_DIV: 1113ac421f39SYohann break; // TODO: Not implemented 1114ac421f39SYohann case CEED_EVAL_CURL: 1115ac421f39SYohann break; // TODO: Not implemented 1116ac421f39SYohann } 1117ac421f39SYohann } 1118ac421f39SYohann } else { 1119920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1120ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1121920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1122ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1123ac421f39SYohann } 1124ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1125920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1126ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1127ac421f39SYohann } 1128ac421f39SYohann } 1129920dcdc4Sjeremylt code << " // QFunction Inputs and outputs\n"; 11304d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 11314d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1132920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1133ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 11344d537eeaSYohann } 11354d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 11364d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1137920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1138ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 11394d537eeaSYohann } 1140920dcdc4Sjeremylt code << "\n // Apply QFunction\n"; 1141241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 1142ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1143ac421f39SYohann if (dim != 3 || basis_data->d_collograd1d) { 1144ac421f39SYohann code << "1 "; 1145ac421f39SYohann } else { 1146ac421f39SYohann code << "Q1d "; 1147ac421f39SYohann } 1148ac421f39SYohann code << ", in, out);\n"; 1149ac421f39SYohann if (basis_data->d_collograd1d) { 1150920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1151ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1152920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1153ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1154ac421f39SYohann CeedChk(ierr); 1155ac421f39SYohann // Basis action 1156920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1157ac421f39SYohann switch (emode) { 1158ac421f39SYohann case CEED_EVAL_NONE: 1159ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1160ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1161ac421f39SYohann code << " }\n"; 1162ac421f39SYohann break; // No action 1163ac421f39SYohann case CEED_EVAL_INTERP: 1164ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1165ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1166ac421f39SYohann code << " }\n"; 1167ac421f39SYohann break; 1168ac421f39SYohann case CEED_EVAL_GRAD: 1169ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1170ac421f39SYohann break; 1171ac421f39SYohann case CEED_EVAL_WEIGHT: 1172ac421f39SYohann break; // Should not occur 1173ac421f39SYohann case CEED_EVAL_DIV: 1174ac421f39SYohann break; // TODO: Not implemented 1175ac421f39SYohann case CEED_EVAL_CURL: 1176ac421f39SYohann break; // TODO: Not implemented 1177ac421f39SYohann } 1178ac421f39SYohann } 1179ac421f39SYohann code << "}\n"; 1180ac421f39SYohann } 1181241a4b83SYohann 1182241a4b83SYohann // Output basis apply if needed 1183ac421f39SYohann // Generate the correct eval mode code for each output 1184920dcdc4Sjeremylt code << "\n// Output field basis action and restrictions\n"; 1185241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1186920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1187241a4b83SYohann // Get elemsize, emode, ncomp 1188241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1189241a4b83SYohann CeedChk(ierr); 1190241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1191241a4b83SYohann CeedChk(ierr); 1192241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1193241a4b83SYohann CeedChk(ierr); 11944d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1195241a4b83SYohann CeedChk(ierr); 1196241a4b83SYohann // Basis action 1197920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1198241a4b83SYohann switch (emode) { 1199241a4b83SYohann case CEED_EVAL_NONE: 1200920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1201241a4b83SYohann break; // No action 1202241a4b83SYohann case CEED_EVAL_INTERP: 1203241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1204241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1205241a4b83SYohann break; 1206241a4b83SYohann case CEED_EVAL_GRAD: 1207241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1208ac421f39SYohann if (basis_data->d_collograd1d) { 1209ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1210ac421f39SYohann } else { 1211241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1212ac421f39SYohann } 1213241a4b83SYohann break; 1214241a4b83SYohann case CEED_EVAL_WEIGHT: { 1215241a4b83SYohann Ceed ceed; 1216241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1217241a4b83SYohann return CeedError(ceed, 1, 1218241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1219241a4b83SYohann break; // Should not occur 1220241a4b83SYohann } 1221241a4b83SYohann case CEED_EVAL_DIV: 1222241a4b83SYohann break; // TODO: Not implemented 1223241a4b83SYohann case CEED_EVAL_CURL: 1224241a4b83SYohann break; // TODO: Not implemented 1225241a4b83SYohann } 1226920dcdc4Sjeremylt // Restriction 1227920dcdc4Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1228920dcdc4Sjeremylt data->indices.out[i] = restr_data->d_ind; 1229920dcdc4Sjeremylt if (data->indices.out[i]) { 1230*5c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1231*5c7b696cSjeremylt CeedChk(ierr); 1232*5c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 1233*5c7b696cSjeremylt CeedInt compstride; 1234*5c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 1235*5c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1236*5c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1237920dcdc4Sjeremylt } else { 1238920dcdc4Sjeremylt bool backendstrides; 1239920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1240920dcdc4Sjeremylt &backendstrides); 1241920dcdc4Sjeremylt CeedChk(ierr); 1242920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1243920dcdc4Sjeremylt if (!backendstrides) { 1244920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1245920dcdc4Sjeremylt CeedChk(ierr); 1246920dcdc4Sjeremylt } 1247920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1248d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1249920dcdc4Sjeremylt } 1250241a4b83SYohann } 1251241a4b83SYohann 1252241a4b83SYohann code << " }\n"; 1253241a4b83SYohann code << "}\n\n"; 1254241a4b83SYohann 1255241a4b83SYohann // std::cout << code.str(); 1256241a4b83SYohann 1257920dcdc4Sjeremylt ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1258920dcdc4Sjeremylt CeedChk(ierr); 1259241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1260241a4b83SYohann CeedChk(ierr); 1261241a4b83SYohann 1262241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1263241a4b83SYohann 1264241a4b83SYohann return 0; 1265241a4b83SYohann } 1266