1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39f1a13f77SYohann Dudouit ); 40f1a13f77SYohann Dudouit 41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45f1a13f77SYohann Dudouit 46f1a13f77SYohann Dudouit typedef struct { 47f1a13f77SYohann Dudouit CeedInt tidx; 48f1a13f77SYohann Dudouit CeedInt tidy; 49f1a13f77SYohann Dudouit CeedInt tidz; 50f1a13f77SYohann Dudouit CeedInt tid; 51f1a13f77SYohann Dudouit CeedScalar* slice; 52f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56ac421f39SYohann for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann } 60241a4b83SYohann 61241a4b83SYohann //**** 62241a4b83SYohann // 1D 63241a4b83SYohann template <int NCOMP, int P1d> 64*ccedf6b0Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 65241a4b83SYohann if (data.tidx<P1d) 66241a4b83SYohann { 674d537eeaSYohann const CeedInt node = data.tidx; 68*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 69241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 708795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 71241a4b83SYohann } 72241a4b83SYohann } 73241a4b83SYohann } 74241a4b83SYohann 75241a4b83SYohann template <int NCOMP, int P1d> 76*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77241a4b83SYohann if (data.tidx<P1d) 78241a4b83SYohann { 794d537eeaSYohann const CeedInt node = data.tidx; 80*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 81241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 82241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85241a4b83SYohann } 86*ccedf6b0Sjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 87*ccedf6b0Sjeremylt if (data.tidx<P1d) 88*ccedf6b0Sjeremylt { 89*ccedf6b0Sjeremylt const CeedInt node = data.tidx; 90*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 91*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 92*ccedf6b0Sjeremylt r_u[comp] = d_u[ind + comp * strides[1]]; 93*ccedf6b0Sjeremylt } 94*ccedf6b0Sjeremylt } 95*ccedf6b0Sjeremylt } 96241a4b83SYohann 97241a4b83SYohann template <int NCOMP, int Q1d> 98241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 994d537eeaSYohann const CeedInt node = data.tidx; 1004d537eeaSYohann const CeedInt ind = node + elem * Q1d; 101241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 102241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 103241a4b83SYohann } 104241a4b83SYohann } 105241a4b83SYohann 106241a4b83SYohann template <int NCOMP, int Q1d> 107241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 1084d537eeaSYohann const CeedInt node = data.tidx; 1094d537eeaSYohann const CeedInt ind = node + elem * Q1d; 110241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 111241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 112241a4b83SYohann } 113241a4b83SYohann } 114241a4b83SYohann 115241a4b83SYohann template <int NCOMP, int P1d> 116*ccedf6b0Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 117241a4b83SYohann if (data.tidx<P1d) 118241a4b83SYohann { 1194d537eeaSYohann const CeedInt node = data.tidx; 120*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 121241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 1228795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 123241a4b83SYohann } 124241a4b83SYohann } 125241a4b83SYohann } 126241a4b83SYohann 127241a4b83SYohann template <int NCOMP, int P1d> 128*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 129241a4b83SYohann if (data.tidx<P1d) 130241a4b83SYohann { 1314d537eeaSYohann const CeedInt node = data.tidx; 132*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 133241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 134241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 135241a4b83SYohann } 136241a4b83SYohann } 137241a4b83SYohann } 138241a4b83SYohann 139*ccedf6b0Sjeremylt template <int NCOMP, int P1d> 140*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 141*ccedf6b0Sjeremylt if (data.tidx<P1d) 142*ccedf6b0Sjeremylt { 143*ccedf6b0Sjeremylt const CeedInt node = data.tidx; 144*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 145*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 146*ccedf6b0Sjeremylt atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]); 147*ccedf6b0Sjeremylt } 148*ccedf6b0Sjeremylt } 149*ccedf6b0Sjeremylt } 150*ccedf6b0Sjeremylt 151241a4b83SYohann template <int NCOMP, int Q1d> 152241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 1534d537eeaSYohann const CeedInt node = data.tidx; 1544d537eeaSYohann const CeedInt ind = node + elem * Q1d; 155241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 156241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 157241a4b83SYohann } 158241a4b83SYohann } 159241a4b83SYohann 160241a4b83SYohann template <int NCOMP, int Q1d> 161241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 1624d537eeaSYohann const CeedInt node = data.tidx; 1634d537eeaSYohann const CeedInt ind = node + elem * Q1d; 164241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 165241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 166241a4b83SYohann } 167241a4b83SYohann } 168241a4b83SYohann 169241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 170241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 171241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172241a4b83SYohann data.slice[data.tidx] = *U; 173241a4b83SYohann __syncthreads(); 174241a4b83SYohann *V = 0.0; 175ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 176241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 177241a4b83SYohann } 178241a4b83SYohann __syncthreads(); 179241a4b83SYohann } 180241a4b83SYohann 181241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 182241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 183241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 184241a4b83SYohann data.slice[data.tidx] = *U; 185241a4b83SYohann __syncthreads(); 186241a4b83SYohann *V = 0.0; 187ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 188241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 189241a4b83SYohann } 190241a4b83SYohann __syncthreads(); 191241a4b83SYohann } 192241a4b83SYohann 193241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 194241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 195241a4b83SYohann CeedScalar *__restrict__ r_V) { 196ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 197241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 198241a4b83SYohann } 199241a4b83SYohann } 200241a4b83SYohann 201241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 202241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 203241a4b83SYohann CeedScalar *__restrict__ r_V) { 204ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 205241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 206241a4b83SYohann } 207241a4b83SYohann } 208241a4b83SYohann 209241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 210241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 211241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 212241a4b83SYohann CeedScalar *__restrict__ r_V) { 213ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 214241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 215241a4b83SYohann } 216241a4b83SYohann } 217241a4b83SYohann 218241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 219241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 220241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 221241a4b83SYohann CeedScalar *__restrict__ r_V) { 222ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 223241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 224241a4b83SYohann } 225241a4b83SYohann } 226241a4b83SYohann 227241a4b83SYohann //**** 228241a4b83SYohann // 2D 229241a4b83SYohann template <int NCOMP, int P1d> 230*ccedf6b0Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 231241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 232241a4b83SYohann { 2334d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 234*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 235241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2368795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 237241a4b83SYohann } 238241a4b83SYohann } 239241a4b83SYohann } 240241a4b83SYohann 241241a4b83SYohann template <int NCOMP, int P1d> 242*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 243241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 244241a4b83SYohann { 2454d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 246*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 247241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 248241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 249241a4b83SYohann } 250241a4b83SYohann } 251241a4b83SYohann } 252*ccedf6b0Sjeremylt template <int NCOMP, int P1d> 253*ccedf6b0Sjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 254*ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 255*ccedf6b0Sjeremylt { 256*ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 257*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 258*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 259*ccedf6b0Sjeremylt r_u[comp] = d_u[ind + comp * strides[1]]; 260*ccedf6b0Sjeremylt } 261*ccedf6b0Sjeremylt } 262*ccedf6b0Sjeremylt } 263241a4b83SYohann 264241a4b83SYohann template <int NCOMP, int Q1d> 265241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 2664d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2674d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 268241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 269241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 270241a4b83SYohann } 271241a4b83SYohann } 272241a4b83SYohann 273241a4b83SYohann template <int NCOMP, int Q1d> 274241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 2754d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2764d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 277241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 278241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 279241a4b83SYohann } 280241a4b83SYohann } 281241a4b83SYohann 282241a4b83SYohann template <int NCOMP, int P1d> 283*ccedf6b0Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 284241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 285241a4b83SYohann { 2864d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 287*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 288241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2898795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 290241a4b83SYohann } 291241a4b83SYohann } 292241a4b83SYohann } 293241a4b83SYohann 294241a4b83SYohann template <int NCOMP, int P1d> 295*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 296241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 297241a4b83SYohann { 2984d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 299*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 300241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 301241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 302241a4b83SYohann } 303241a4b83SYohann } 304241a4b83SYohann } 305241a4b83SYohann 306*ccedf6b0Sjeremylt template <int NCOMP, int P1d> 307*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 308*ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 309*ccedf6b0Sjeremylt { 310*ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 311*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 312*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 313*ccedf6b0Sjeremylt atomicAdd(&d_v[ind + comp * strides[1]], r_v[comp]); 314*ccedf6b0Sjeremylt } 315*ccedf6b0Sjeremylt } 316*ccedf6b0Sjeremylt } 317*ccedf6b0Sjeremylt 318241a4b83SYohann template <int NCOMP, int Q1d> 319241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 3204d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 3214d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 322241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 323241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 324241a4b83SYohann } 325241a4b83SYohann } 326241a4b83SYohann 327241a4b83SYohann template <int NCOMP, int Q1d> 328241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 3294d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 3304d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 331241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 332241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 333241a4b83SYohann } 334241a4b83SYohann } 335241a4b83SYohann 336241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 337241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 338241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 339241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 340241a4b83SYohann __syncthreads(); 341241a4b83SYohann *V = 0.0; 342ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 343241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 344241a4b83SYohann } 345241a4b83SYohann __syncthreads(); 346241a4b83SYohann } 347241a4b83SYohann 348241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 349241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 350241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 351241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 352241a4b83SYohann __syncthreads(); 353241a4b83SYohann *V = 0.0; 354ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 355241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 356241a4b83SYohann } 357241a4b83SYohann __syncthreads(); 358241a4b83SYohann } 359241a4b83SYohann 360241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 361241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 362241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 363241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 364241a4b83SYohann __syncthreads(); 365241a4b83SYohann *V = 0.0; 366241a4b83SYohann if (data.tidy<P1d) { 367ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 368241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 369241a4b83SYohann } 370241a4b83SYohann } 371241a4b83SYohann __syncthreads(); 372241a4b83SYohann } 373241a4b83SYohann 374241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 375241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 376241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 377241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 378241a4b83SYohann __syncthreads(); 379241a4b83SYohann *V = 0.0; 380241a4b83SYohann if (data.tidx<P1d) { 381ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 382241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 383241a4b83SYohann } 384241a4b83SYohann } 385241a4b83SYohann __syncthreads(); 386241a4b83SYohann } 387241a4b83SYohann 388241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 389241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 390241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 391241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 392241a4b83SYohann __syncthreads(); 393241a4b83SYohann if (data.tidx<P1d) { 394ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 395241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 396241a4b83SYohann } 397241a4b83SYohann } 398241a4b83SYohann __syncthreads(); 399241a4b83SYohann } 400241a4b83SYohann 401241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 402241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 403241a4b83SYohann CeedScalar *__restrict__ r_V) { 404241a4b83SYohann CeedScalar r_t[1]; 405ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 406241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 407241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 408241a4b83SYohann } 409241a4b83SYohann } 410241a4b83SYohann 411241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 412241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 413241a4b83SYohann CeedScalar *__restrict__ r_V) { 414241a4b83SYohann CeedScalar r_t[1]; 415ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 416241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 417241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 418241a4b83SYohann } 419241a4b83SYohann } 420241a4b83SYohann 421241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 422241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 423241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 424241a4b83SYohann CeedScalar *__restrict__ r_V) { 425241a4b83SYohann CeedScalar r_t[1]; 426ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 427241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 428241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 429241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 430241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 431241a4b83SYohann } 432241a4b83SYohann } 433241a4b83SYohann 434241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 435241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 436241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 437241a4b83SYohann CeedScalar *__restrict__ r_V) { 438241a4b83SYohann CeedScalar r_t[1]; 439ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 440241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 441241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 442241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 443241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 444241a4b83SYohann } 445241a4b83SYohann } 446241a4b83SYohann 447241a4b83SYohann //**** 448241a4b83SYohann // 3D 449241a4b83SYohann template <int NCOMP, int P1d> 450*ccedf6b0Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 451241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 452241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4534d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 454*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 455241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 4568795c945Sjeremylt r_u[z+comp*P1d] = d_u[ind + nnodes * comp]; 457241a4b83SYohann } 458241a4b83SYohann } 459241a4b83SYohann } 460241a4b83SYohann } 461241a4b83SYohann 462241a4b83SYohann template <int NCOMP, int P1d> 463*ccedf6b0Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 464241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 465241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4664d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 467*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 468241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 469241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 470241a4b83SYohann } 471241a4b83SYohann } 472241a4b83SYohann } 473241a4b83SYohann } 474*ccedf6b0Sjeremylt template <int NCOMP, int P1d> 475*ccedf6b0Sjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 476*ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 477*ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 478*ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 479*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 480*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 481*ccedf6b0Sjeremylt r_u[z+comp*P1d] = d_u[ind + comp * strides[1]]; 482*ccedf6b0Sjeremylt } 483*ccedf6b0Sjeremylt } 484*ccedf6b0Sjeremylt } 485*ccedf6b0Sjeremylt } 486241a4b83SYohann 487241a4b83SYohann template <int NCOMP, int Q1d> 488241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 489241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 4904d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 4914d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 492241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 493241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind + nquads * comp]; 494241a4b83SYohann } 495241a4b83SYohann } 496241a4b83SYohann } 497241a4b83SYohann 498241a4b83SYohann template <int NCOMP, int Q1d> 499ac421f39SYohann inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 500ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 501ac421f39SYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 502ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 503ac421f39SYohann r_u[comp] = d_u[ind + nquads * comp]; 504ac421f39SYohann } 505ac421f39SYohann } 506ac421f39SYohann 507ac421f39SYohann template <int NCOMP, int Q1d> 508241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 509241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 5104d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 5114d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 512241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 513241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp]; 514241a4b83SYohann } 515241a4b83SYohann } 516241a4b83SYohann } 517241a4b83SYohann 518ac421f39SYohann template <int NCOMP, int Q1d> 519ac421f39SYohann inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 520ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 521ac421f39SYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 522ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 523ac421f39SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 524ac421f39SYohann } 525ac421f39SYohann } 526ac421f39SYohann 527241a4b83SYohann template <int NCOMP, int P1d> 528*ccedf6b0Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 529241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 530241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 5314d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 532*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 533241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 5348795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]); 535241a4b83SYohann } 536241a4b83SYohann } 537241a4b83SYohann } 538241a4b83SYohann } 539241a4b83SYohann 540241a4b83SYohann template <int NCOMP, int P1d> 541*ccedf6b0Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 542241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 543241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 5444d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 545*ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 546241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 547241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 548241a4b83SYohann } 549241a4b83SYohann } 550241a4b83SYohann } 551241a4b83SYohann } 552241a4b83SYohann 553*ccedf6b0Sjeremylt template <int NCOMP, int P1d> 554*ccedf6b0Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt strides[3], const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 555*ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 556*ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 557*ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 558*ccedf6b0Sjeremylt const CeedInt ind = node * strides[0] + elem * strides[2]; 559*ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 560*ccedf6b0Sjeremylt atomicAdd(&d_v[ind + comp * strides[1]], r_v[z+comp*P1d]); 561*ccedf6b0Sjeremylt } 562*ccedf6b0Sjeremylt } 563*ccedf6b0Sjeremylt } 564*ccedf6b0Sjeremylt } 565*ccedf6b0Sjeremylt 566241a4b83SYohann template <int NCOMP, int Q1d> 567241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 568241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 5694d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 5704d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 571241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 572241a4b83SYohann d_v[ind + nquads * comp] = r_v[z+comp*Q1d]; 573241a4b83SYohann } 574241a4b83SYohann } 575241a4b83SYohann } 576241a4b83SYohann 577241a4b83SYohann template <int NCOMP, int Q1d> 578241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 579241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 5804d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 5814d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 582241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 583241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d]; 584241a4b83SYohann } 585241a4b83SYohann } 586241a4b83SYohann } 587241a4b83SYohann 588241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 589241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 590241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 591ac421f39SYohann CeedScalar r_B[P1d]; 592ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 593ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 594ac421f39SYohann } 595ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 596241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 597241a4b83SYohann __syncthreads(); 598241a4b83SYohann V[k] = 0.0; 599ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 600ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 601241a4b83SYohann } 602241a4b83SYohann __syncthreads(); 603241a4b83SYohann } 604241a4b83SYohann } 605241a4b83SYohann 606241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 607241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 608241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 609ac421f39SYohann CeedScalar r_B[P1d]; 610ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 611ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 612ac421f39SYohann } 613ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 614241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 615241a4b83SYohann __syncthreads(); 616241a4b83SYohann V[k] = 0.0; 617ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 618ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 619241a4b83SYohann } 620241a4b83SYohann __syncthreads(); 621241a4b83SYohann } 622241a4b83SYohann } 623241a4b83SYohann 624241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 625241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 626241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 627ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 628241a4b83SYohann V[k] = 0.0; 629ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 630241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 631241a4b83SYohann } 632241a4b83SYohann } 633241a4b83SYohann } 634241a4b83SYohann 635241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 636241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 637241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 638ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 639241a4b83SYohann V[k] = 0.0; 640241a4b83SYohann if (k<P1d) { 641ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 642241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 643241a4b83SYohann } 644241a4b83SYohann } 645241a4b83SYohann } 646241a4b83SYohann } 647241a4b83SYohann 648241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 649241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 650241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 651ac421f39SYohann CeedScalar r_B[Q1d]; 652ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 653ac421f39SYohann { 654ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 655ac421f39SYohann } 656ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 657241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 658241a4b83SYohann __syncthreads(); 659241a4b83SYohann V[k] = 0.0; 660241a4b83SYohann if (data.tidy<P1d) { 661ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 662ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 663ac421f39SYohann } 664ac421f39SYohann } 665ac421f39SYohann __syncthreads(); 666ac421f39SYohann } 667ac421f39SYohann } 668ac421f39SYohann 669ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 670ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data, 671ac421f39SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 672ac421f39SYohann CeedScalar r_B[Q1d]; 673ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 674ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 675ac421f39SYohann } 676ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 677ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 678ac421f39SYohann __syncthreads(); 679ac421f39SYohann if (data.tidy<P1d) { 680ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 681ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 682241a4b83SYohann } 683241a4b83SYohann } 684241a4b83SYohann __syncthreads(); 685241a4b83SYohann } 686241a4b83SYohann } 687241a4b83SYohann 688241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 689241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 690241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 691ac421f39SYohann CeedScalar r_B[Q1d]; 692ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 693ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 694ac421f39SYohann } 695ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 696241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 697241a4b83SYohann __syncthreads(); 698241a4b83SYohann V[k] = 0.0; 699241a4b83SYohann if (data.tidx<P1d) { 700ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 701ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 702241a4b83SYohann } 703241a4b83SYohann } 704241a4b83SYohann __syncthreads(); 705241a4b83SYohann } 706241a4b83SYohann } 707241a4b83SYohann 708241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 709241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 710241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 711ac421f39SYohann CeedScalar r_B[Q1d]; 712ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 713ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 714ac421f39SYohann } 715ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 716241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 717241a4b83SYohann __syncthreads(); 718241a4b83SYohann if (data.tidx<P1d) { 719ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 720ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 721241a4b83SYohann } 722241a4b83SYohann } 723241a4b83SYohann __syncthreads(); 724241a4b83SYohann } 725241a4b83SYohann } 726241a4b83SYohann 727241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 728241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 729241a4b83SYohann CeedScalar *__restrict__ r_V) { 730241a4b83SYohann CeedScalar r_t1[Q1d]; 731241a4b83SYohann CeedScalar r_t2[Q1d]; 732ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 733241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 734241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 735241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 736241a4b83SYohann } 737241a4b83SYohann } 738241a4b83SYohann 739241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 740241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 741241a4b83SYohann CeedScalar *__restrict__ r_V) { 742241a4b83SYohann CeedScalar r_t1[Q1d]; 743241a4b83SYohann CeedScalar r_t2[Q1d]; 744ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 745241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 746241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 747241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 748241a4b83SYohann } 749241a4b83SYohann } 750241a4b83SYohann 751241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 752241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 753241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 754241a4b83SYohann CeedScalar *__restrict__ r_V) { 755241a4b83SYohann CeedScalar r_t1[Q1d]; 756241a4b83SYohann CeedScalar r_t2[Q1d]; 757ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 758241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 759241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 760241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 761241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 762241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 763241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 764241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 765241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 766241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 767241a4b83SYohann } 768241a4b83SYohann } 769241a4b83SYohann 770241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 771241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 772241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 773241a4b83SYohann CeedScalar *__restrict__ r_V) { 774241a4b83SYohann CeedScalar r_t1[Q1d]; 775241a4b83SYohann CeedScalar r_t2[Q1d]; 776ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 777241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 778241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 779241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 780241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 781241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 782241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 783241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 784241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 785241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 786241a4b83SYohann } 787241a4b83SYohann } 788241a4b83SYohann 789ac421f39SYohann template <int NCOMP, int Q1d> 790ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, 791ac421f39SYohann const CeedScalar *__restrict__ r_U, 792ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 793ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 794ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d]; 795ac421f39SYohann __syncthreads(); 796ac421f39SYohann // X derivative 797ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 798ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 799ac421f39SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 800ac421f39SYohann } 801ac421f39SYohann // Y derivative 802ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 803ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 804ac421f39SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 805ac421f39SYohann } 806ac421f39SYohann // Z derivative 807ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 808ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 809ac421f39SYohann r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative) 810ac421f39SYohann } 811ac421f39SYohann __syncthreads(); 812ac421f39SYohann } 813ac421f39SYohann } 814ac421f39SYohann 815ac421f39SYohann template <int NCOMP, int Q1d> 816ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, 817ac421f39SYohann const CeedScalar *__restrict__ r_U, 818ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 819ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 820ac421f39SYohann // X derivative 821ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 822ac421f39SYohann __syncthreads(); 823ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 824ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 825ac421f39SYohann } 826ac421f39SYohann __syncthreads(); 827ac421f39SYohann // Y derivative 828ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 829ac421f39SYohann __syncthreads(); 830ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 831ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 832ac421f39SYohann } 833ac421f39SYohann __syncthreads(); 834ac421f39SYohann // Z derivative 835ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 836ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative) 837ac421f39SYohann } 838ac421f39SYohann } 839ac421f39SYohann } 840ac421f39SYohann 841241a4b83SYohann template <int Q1d> 842241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 843241a4b83SYohann *w = qweight1d[data.tidx]; 844241a4b83SYohann } 845241a4b83SYohann 846241a4b83SYohann template <int Q1d> 847241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 848241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 849241a4b83SYohann } 850241a4b83SYohann 851241a4b83SYohann template <int Q1d> 852241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 853241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 854ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 855241a4b83SYohann { 856241a4b83SYohann w[z] = pw*qweight1d[z]; 857241a4b83SYohann } 858241a4b83SYohann } 859241a4b83SYohann 860241a4b83SYohann ); 861241a4b83SYohann 862241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 863241a4b83SYohann 864241a4b83SYohann using std::ostringstream; 865241a4b83SYohann using std::string; 866241a4b83SYohann int ierr; 867241a4b83SYohann bool setupdone; 868241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 869241a4b83SYohann if (setupdone) return 0; 870241a4b83SYohann Ceed ceed; 871241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 872241a4b83SYohann CeedOperator_Cuda_gen *data; 873241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 874241a4b83SYohann CeedQFunction qf; 875241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 876241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 877241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 8788795c945Sjeremylt CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; 879241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 880241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 881241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 882241a4b83SYohann CeedChk(ierr); 883241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 884241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 885241a4b83SYohann CeedChk(ierr); 886241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 887241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 888241a4b83SYohann CeedChk(ierr); 889241a4b83SYohann CeedEvalMode emode; 89061dbc9d2Sjeremylt CeedInterlaceMode imode; 891241a4b83SYohann CeedBasis basis; 892241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 893241a4b83SYohann CeedElemRestriction Erestrict; 894241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 895241a4b83SYohann 896241a4b83SYohann ostringstream code; 897241a4b83SYohann string devFunctions(deviceFunctions); 898241a4b83SYohann 899f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 900f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 901f1a13f77SYohann Dudouit Ceed delegate; 902f1a13f77SYohann Dudouit CeedGetDelegate(ceed, &delegate); 903f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 904f1a13f77SYohann Dudouit ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 905f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 906f1a13f77SYohann Dudouit if (prop.major<6){ 907f1a13f77SYohann Dudouit code << atomicAdd; 908f1a13f77SYohann Dudouit } 909f1a13f77SYohann Dudouit 910241a4b83SYohann code << devFunctions; 911241a4b83SYohann 912241a4b83SYohann string qFunction(qf_data->qFunctionSource); 9134d537eeaSYohann 9144d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 915ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 916241a4b83SYohann code << qFunction; 917241a4b83SYohann 918241a4b83SYohann // Setup 919241a4b83SYohann code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 920241a4b83SYohann // Input Evecs and Restriction 921241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 922241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 923241a4b83SYohann CeedChk(ierr); 924241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 925241a4b83SYohann } else { 926241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 927241a4b83SYohann if (emode != CEED_EVAL_NONE) 928241a4b83SYohann { 929241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 930241a4b83SYohann bool isTensor; 931241a4b83SYohann ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 932241a4b83SYohann //TODO check that all are the same 933241a4b83SYohann ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 934241a4b83SYohann if (isTensor) 935241a4b83SYohann { 936241a4b83SYohann //TODO check that all are the same 937241a4b83SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 938241a4b83SYohann } else { 939241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 940241a4b83SYohann } 941241a4b83SYohann } 942241a4b83SYohann } 943241a4b83SYohann } 944241a4b83SYohann data->dim = dim; 945241a4b83SYohann data->Q1d = Q1d; 946241a4b83SYohann 947241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 948241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 949241a4b83SYohann } 950241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 951241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 952241a4b83SYohann // code << "const CeedInt Q = "<<Q<<";\n"; 953241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 954241a4b83SYohann code << "BackendData data;\n"; 955241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 956241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 957241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 958241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 959241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 960ac421f39SYohann //Initialize constants, and matrices B and G 961241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 962241a4b83SYohann code << "// Input field "<<i<<"\n"; 963241a4b83SYohann // Get elemsize, emode, ncomp 964241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 965241a4b83SYohann CeedChk(ierr); 966241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 967241a4b83SYohann CeedChk(ierr); 968241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 969241a4b83SYohann CeedChk(ierr); 9704d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 971241a4b83SYohann CeedChk(ierr); 972241a4b83SYohann // Basis action 973241a4b83SYohann switch (emode) { 974241a4b83SYohann case CEED_EVAL_NONE: 9758795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 976241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9778795c945Sjeremylt code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 978241a4b83SYohann break; 979241a4b83SYohann case CEED_EVAL_INTERP: 980241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 981241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 9828795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 983241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 984241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9858795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 986241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 988241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 989241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 990241a4b83SYohann break; 991241a4b83SYohann case CEED_EVAL_GRAD: 992241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 9938795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 994241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9958795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 996241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 997241a4b83SYohann code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 998241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 999241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 1000241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 1001241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 1002ac421f39SYohann if (basis_data->d_collograd1d) { 1003ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 1004ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1005ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 1006ac421f39SYohann } else { 1007ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 1008ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 1009241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 1010ac421f39SYohann } 1011241a4b83SYohann break; 1012241a4b83SYohann case CEED_EVAL_WEIGHT: 1013241a4b83SYohann break; // No action 1014241a4b83SYohann case CEED_EVAL_DIV: 1015241a4b83SYohann break; // TODO: Not implemented 1016241a4b83SYohann case CEED_EVAL_CURL: 1017241a4b83SYohann break; // TODO: Not implemented 1018241a4b83SYohann } 1019241a4b83SYohann } 1020241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1021241a4b83SYohann code << "// Output field "<<i<<"\n"; 1022241a4b83SYohann // Get elemsize, emode, ncomp 1023241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1024241a4b83SYohann CeedChk(ierr); 1025241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1026241a4b83SYohann CeedChk(ierr); 1027241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1028241a4b83SYohann CeedChk(ierr); 10294d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1030241a4b83SYohann CeedChk(ierr); 1031241a4b83SYohann // Basis action 1032241a4b83SYohann switch (emode) { 1033241a4b83SYohann case CEED_EVAL_NONE: 1034241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 10358795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 10368795c945Sjeremylt code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 1037241a4b83SYohann break; // No action 1038241a4b83SYohann case CEED_EVAL_INTERP: 1039241a4b83SYohann code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1040241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 1041241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 1042241a4b83SYohann code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 1043241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1044241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 1045241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1046241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 10478795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 10488795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1049241a4b83SYohann break; 1050241a4b83SYohann case CEED_EVAL_GRAD: 1051241a4b83SYohann code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1052241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 1053241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 1054241a4b83SYohann code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 1055241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1056241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 1057241a4b83SYohann code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1058241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1059ac421f39SYohann if (basis_data->d_collograd1d) { 1060ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 1061ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1062ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1063ac421f39SYohann } else { 1064ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 1065ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1066241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1067ac421f39SYohann } 10688795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 10698795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1070241a4b83SYohann break; 1071241a4b83SYohann case CEED_EVAL_WEIGHT: { 1072241a4b83SYohann Ceed ceed; 1073241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1074241a4b83SYohann return CeedError(ceed, 1, 1075241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1076241a4b83SYohann break; // Should not occur 1077241a4b83SYohann } 1078241a4b83SYohann case CEED_EVAL_DIV: 1079241a4b83SYohann break; // TODO: Not implemented 1080241a4b83SYohann case CEED_EVAL_CURL: 1081241a4b83SYohann break; // TODO: Not implemented 1082241a4b83SYohann } 1083241a4b83SYohann } 1084ac421f39SYohann code << "\n"; 1085ac421f39SYohann code << "__syncthreads();\n"; 1086241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1087241a4b83SYohann // Input basis apply if needed 1088ac421f39SYohann // Generate the correct eval mode code for each input 1089241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1090241a4b83SYohann code << " // Input field "<<i<<"\n"; 1091241a4b83SYohann // Get elemsize, emode, ncomp 1092241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1093241a4b83SYohann CeedChk(ierr); 1094241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1095241a4b83SYohann CeedChk(ierr); 1096241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1097241a4b83SYohann CeedChk(ierr); 10984d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1099241a4b83SYohann CeedChk(ierr); 1100241a4b83SYohann // Basis action 1101241a4b83SYohann switch (emode) { 1102241a4b83SYohann case CEED_EVAL_NONE: 1103ac421f39SYohann if (!basis_data->d_collograd1d) { 1104241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 110561dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 110661dbc9d2Sjeremylt code << " readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 1107ac421f39SYohann } 1108241a4b83SYohann break; 1109241a4b83SYohann case CEED_EVAL_INTERP: 1110241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1111241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1112241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 1113*ccedf6b0Sjeremylt if (data->indicies.in[i]) { 1114*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1115*ccedf6b0Sjeremylt } else { 1116*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr); 1117*ccedf6b0Sjeremylt } 1118*ccedf6b0Sjeremylt code << " readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1119241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1120241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1121241a4b83SYohann break; 1122241a4b83SYohann case CEED_EVAL_GRAD: 1123241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1124241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1125241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 1126*ccedf6b0Sjeremylt if (data->indicies.in[i]) { 1127*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1128*ccedf6b0Sjeremylt } else { 1129*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.in[i])); CeedChk(ierr); 1130*ccedf6b0Sjeremylt } 1131*ccedf6b0Sjeremylt code << " readDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, strides.in["<<i<<"], indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1132ac421f39SYohann if (basis_data->d_collograd1d) { 1133ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1134ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1135ac421f39SYohann } else { 1136241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1137241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1138ac421f39SYohann } 1139241a4b83SYohann break; 1140241a4b83SYohann case CEED_EVAL_WEIGHT: 1141241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1142241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1143241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1144241a4b83SYohann data->W = basis_data->d_qweight1d; 1145241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1146241a4b83SYohann break; // No action 1147241a4b83SYohann case CEED_EVAL_DIV: 1148241a4b83SYohann break; // TODO: Not implemented 1149241a4b83SYohann case CEED_EVAL_CURL: 1150241a4b83SYohann break; // TODO: Not implemented 1151241a4b83SYohann } 1152241a4b83SYohann } 1153ac421f39SYohann 1154241a4b83SYohann // Q function 1155241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1156ac421f39SYohann code << " // Output field "<<i<<"\n"; 1157241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1158241a4b83SYohann CeedChk(ierr); 1159241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1160241a4b83SYohann { 1161ac421f39SYohann if (basis_data->d_collograd1d) { 1162ac421f39SYohann //Accumulator for gradient slices 1163ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1164ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1165ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1166ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1167ac421f39SYohann code << " }\n"; 1168ac421f39SYohann code << " }\n"; 1169ac421f39SYohann } else { 1170241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1171241a4b83SYohann } 1172ac421f39SYohann } 1173241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1174241a4b83SYohann { 1175241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1176241a4b83SYohann } 1177241a4b83SYohann } 1178ac421f39SYohann //We treat quadrature points per slice in 3d to save registers 1179ac421f39SYohann code << "// QFunction\n"; 1180ac421f39SYohann if (basis_data->d_collograd1d) { 1181ac421f39SYohann code << "#pragma unroll\n"; 1182ac421f39SYohann code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1183ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1184ac421f39SYohann code << " // Input field "<<i<<"\n"; 1185ac421f39SYohann // Get elemsize, emode, ncomp 1186ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1187ac421f39SYohann CeedChk(ierr); 1188ac421f39SYohann // Basis action 1189ac421f39SYohann switch (emode) { 1190ac421f39SYohann case CEED_EVAL_NONE: 1191ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 119261dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 119361dbc9d2Sjeremylt code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1194ac421f39SYohann break; 1195ac421f39SYohann case CEED_EVAL_INTERP: 1196ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1197ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1198ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1199ac421f39SYohann code << " }\n"; 1200ac421f39SYohann break; 1201ac421f39SYohann case CEED_EVAL_GRAD: 1202ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1203ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1204ac421f39SYohann break; 1205ac421f39SYohann case CEED_EVAL_WEIGHT: 1206ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1207ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1208ac421f39SYohann break; // No action 1209ac421f39SYohann case CEED_EVAL_DIV: 1210ac421f39SYohann break; // TODO: Not implemented 1211ac421f39SYohann case CEED_EVAL_CURL: 1212ac421f39SYohann break; // TODO: Not implemented 1213ac421f39SYohann } 1214ac421f39SYohann } 1215ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1216ac421f39SYohann code << " // Output field "<<i<<"\n"; 1217ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1218ac421f39SYohann CeedChk(ierr); 1219ac421f39SYohann // Basis action 1220ac421f39SYohann switch (emode) { 1221ac421f39SYohann case CEED_EVAL_NONE: 1222ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1223ac421f39SYohann break; // No action 1224ac421f39SYohann case CEED_EVAL_INTERP: 1225ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1226ac421f39SYohann break; 1227ac421f39SYohann case CEED_EVAL_GRAD: 1228ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1229ac421f39SYohann break; 1230ac421f39SYohann case CEED_EVAL_WEIGHT: 1231ac421f39SYohann break; // Should not occur 1232ac421f39SYohann case CEED_EVAL_DIV: 1233ac421f39SYohann break; // TODO: Not implemented 1234ac421f39SYohann case CEED_EVAL_CURL: 1235ac421f39SYohann break; // TODO: Not implemented 1236ac421f39SYohann } 1237ac421f39SYohann } 1238ac421f39SYohann } else { 1239ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1240ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1241ac421f39SYohann } 1242ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1243ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1244ac421f39SYohann } 1245ac421f39SYohann } 12464d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12474d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1248ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12494d537eeaSYohann } 12504d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12514d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1252ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12534d537eeaSYohann } 1254241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 1255ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1256ac421f39SYohann if (dim != 3 || basis_data->d_collograd1d) { 1257ac421f39SYohann code << "1 "; 1258ac421f39SYohann }else{ 1259ac421f39SYohann code << "Q1d "; 1260ac421f39SYohann } 1261ac421f39SYohann code << ", in, out);\n"; 1262ac421f39SYohann if (basis_data->d_collograd1d) { 1263ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1264ac421f39SYohann code << " // Output field "<<i<<"\n"; 1265ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1266ac421f39SYohann CeedChk(ierr); 1267ac421f39SYohann // Basis action 1268ac421f39SYohann switch (emode) { 1269ac421f39SYohann case CEED_EVAL_NONE: 1270ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1271ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1272ac421f39SYohann code << " }\n"; 1273ac421f39SYohann break; // No action 1274ac421f39SYohann case CEED_EVAL_INTERP: 1275ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1276ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1277ac421f39SYohann code << " }\n"; 1278ac421f39SYohann break; 1279ac421f39SYohann case CEED_EVAL_GRAD: 1280ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1281ac421f39SYohann break; 1282ac421f39SYohann case CEED_EVAL_WEIGHT: 1283ac421f39SYohann break; // Should not occur 1284ac421f39SYohann case CEED_EVAL_DIV: 1285ac421f39SYohann break; // TODO: Not implemented 1286ac421f39SYohann case CEED_EVAL_CURL: 1287ac421f39SYohann break; // TODO: Not implemented 1288ac421f39SYohann } 1289ac421f39SYohann } 1290ac421f39SYohann code << "}\n"; 1291ac421f39SYohann } 1292241a4b83SYohann 1293241a4b83SYohann // Output basis apply if needed 1294ac421f39SYohann // Generate the correct eval mode code for each output 1295241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1296241a4b83SYohann code << " // Output field "<<i<<"\n"; 1297241a4b83SYohann // Get elemsize, emode, ncomp 1298241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1299241a4b83SYohann CeedChk(ierr); 1300241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1301241a4b83SYohann CeedChk(ierr); 1302241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1303241a4b83SYohann CeedChk(ierr); 13044d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1305241a4b83SYohann CeedChk(ierr); 1306241a4b83SYohann // Basis action 1307241a4b83SYohann switch (emode) { 1308241a4b83SYohann case CEED_EVAL_NONE: 130961dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 131061dbc9d2Sjeremylt code << " writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 1311241a4b83SYohann break; // No action 1312241a4b83SYohann case CEED_EVAL_INTERP: 1313241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1314241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1315241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1316241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 1317*ccedf6b0Sjeremylt if (data->indicies.out[i]) { 1318*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1319*ccedf6b0Sjeremylt } else { 1320*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr); 1321*ccedf6b0Sjeremylt } 1322*ccedf6b0Sjeremylt code << " writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1323241a4b83SYohann break; 1324241a4b83SYohann case CEED_EVAL_GRAD: 1325241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1326ac421f39SYohann if (basis_data->d_collograd1d) { 1327ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1328ac421f39SYohann } else { 1329241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1330ac421f39SYohann } 1331241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1332241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 1333*ccedf6b0Sjeremylt if (data->indicies.out[i]) { 1334*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1335*ccedf6b0Sjeremylt } else { 1336*ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &(data->strides.out[i])); CeedChk(ierr); 1337*ccedf6b0Sjeremylt } 1338*ccedf6b0Sjeremylt code << " writeDofs"<<(restr_data->d_ind?(imode==CEED_NONINTERLACED?"":"Transpose"):"Strided")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, strides.out["<<i<<"], indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1339241a4b83SYohann break; 1340241a4b83SYohann case CEED_EVAL_WEIGHT: { 1341241a4b83SYohann Ceed ceed; 1342241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1343241a4b83SYohann return CeedError(ceed, 1, 1344241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1345241a4b83SYohann break; // Should not occur 1346241a4b83SYohann } 1347241a4b83SYohann case CEED_EVAL_DIV: 1348241a4b83SYohann break; // TODO: Not implemented 1349241a4b83SYohann case CEED_EVAL_CURL: 1350241a4b83SYohann break; // TODO: Not implemented 1351241a4b83SYohann } 1352241a4b83SYohann } 1353241a4b83SYohann 1354241a4b83SYohann code << " }\n"; 1355241a4b83SYohann code << "}\n\n"; 1356241a4b83SYohann 1357241a4b83SYohann // std::cout << code.str(); 1358241a4b83SYohann 1359241a4b83SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1360241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1361241a4b83SYohann CeedChk(ierr); 1362241a4b83SYohann 1363241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1364241a4b83SYohann 1365241a4b83SYohann return 0; 1366241a4b83SYohann } 1367