1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39f1a13f77SYohann Dudouit ); 40f1a13f77SYohann Dudouit 41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45f1a13f77SYohann Dudouit 46f1a13f77SYohann Dudouit typedef struct { 47f1a13f77SYohann Dudouit CeedInt tidx; 48f1a13f77SYohann Dudouit CeedInt tidy; 49f1a13f77SYohann Dudouit CeedInt tidz; 50f1a13f77SYohann Dudouit CeedInt tid; 51f1a13f77SYohann Dudouit CeedScalar* slice; 52f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56ac421f39SYohann for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann } 60241a4b83SYohann 61241a4b83SYohann //**** 62241a4b83SYohann // 1D 63241a4b83SYohann template <int NCOMP, int P1d> 64920dcdc4Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 65241a4b83SYohann if (data.tidx<P1d) 66241a4b83SYohann { 674d537eeaSYohann const CeedInt node = data.tidx; 68ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 69241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 708795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 71241a4b83SYohann } 72241a4b83SYohann } 73241a4b83SYohann } 74241a4b83SYohann 75241a4b83SYohann template <int NCOMP, int P1d> 76920dcdc4Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77241a4b83SYohann if (data.tidx<P1d) 78241a4b83SYohann { 794d537eeaSYohann const CeedInt node = data.tidx; 80ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 81241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 82241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85241a4b83SYohann } 86920dcdc4Sjeremylt 87d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 88d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 89ccedf6b0Sjeremylt if (data.tidx<P1d) 90ccedf6b0Sjeremylt { 91ccedf6b0Sjeremylt const CeedInt node = data.tidx; 92d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 93ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 94d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 95ccedf6b0Sjeremylt } 96ccedf6b0Sjeremylt } 97ccedf6b0Sjeremylt } 98241a4b83SYohann 99241a4b83SYohann template <int NCOMP, int P1d> 100920dcdc4Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 101241a4b83SYohann if (data.tidx<P1d) 102241a4b83SYohann { 1034d537eeaSYohann const CeedInt node = data.tidx; 104ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 105241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 1068795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 107241a4b83SYohann } 108241a4b83SYohann } 109241a4b83SYohann } 110241a4b83SYohann 111241a4b83SYohann template <int NCOMP, int P1d> 112920dcdc4Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 113241a4b83SYohann if (data.tidx<P1d) 114241a4b83SYohann { 1154d537eeaSYohann const CeedInt node = data.tidx; 116ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 117241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 118241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 119241a4b83SYohann } 120241a4b83SYohann } 121241a4b83SYohann } 122241a4b83SYohann 123d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 124d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 125ccedf6b0Sjeremylt if (data.tidx<P1d) 126ccedf6b0Sjeremylt { 127ccedf6b0Sjeremylt const CeedInt node = data.tidx; 128d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 129ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 130d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 131ccedf6b0Sjeremylt } 132ccedf6b0Sjeremylt } 133ccedf6b0Sjeremylt } 134ccedf6b0Sjeremylt 135241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 136241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 137241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 138241a4b83SYohann data.slice[data.tidx] = *U; 139241a4b83SYohann __syncthreads(); 140241a4b83SYohann *V = 0.0; 141ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 142241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 143241a4b83SYohann } 144241a4b83SYohann __syncthreads(); 145241a4b83SYohann } 146241a4b83SYohann 147241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 148241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 149241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 150241a4b83SYohann data.slice[data.tidx] = *U; 151241a4b83SYohann __syncthreads(); 152241a4b83SYohann *V = 0.0; 153ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 154241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 155241a4b83SYohann } 156241a4b83SYohann __syncthreads(); 157241a4b83SYohann } 158241a4b83SYohann 159241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 160241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 161241a4b83SYohann CeedScalar *__restrict__ r_V) { 162ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 163241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 164241a4b83SYohann } 165241a4b83SYohann } 166241a4b83SYohann 167241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 168241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 169241a4b83SYohann CeedScalar *__restrict__ r_V) { 170ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 171241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 172241a4b83SYohann } 173241a4b83SYohann } 174241a4b83SYohann 175241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 176241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 177241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 178241a4b83SYohann CeedScalar *__restrict__ r_V) { 179ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 180241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 181241a4b83SYohann } 182241a4b83SYohann } 183241a4b83SYohann 184241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 185241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 186241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 187241a4b83SYohann CeedScalar *__restrict__ r_V) { 188ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 189241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 190241a4b83SYohann } 191241a4b83SYohann } 192241a4b83SYohann 193241a4b83SYohann //**** 194241a4b83SYohann // 2D 195241a4b83SYohann template <int NCOMP, int P1d> 196920dcdc4Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 197241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 198241a4b83SYohann { 1994d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 200ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 201241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2028795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 203241a4b83SYohann } 204241a4b83SYohann } 205241a4b83SYohann } 206241a4b83SYohann 207241a4b83SYohann template <int NCOMP, int P1d> 208920dcdc4Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 209241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 210241a4b83SYohann { 2114d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 212ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 213241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 214241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 215241a4b83SYohann } 216241a4b83SYohann } 217241a4b83SYohann } 218920dcdc4Sjeremylt 219d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 220d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 221ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 222ccedf6b0Sjeremylt { 223ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 224d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 225ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 226d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 227ccedf6b0Sjeremylt } 228ccedf6b0Sjeremylt } 229ccedf6b0Sjeremylt } 230241a4b83SYohann 231241a4b83SYohann template <int NCOMP, int P1d> 232920dcdc4Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 233241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 234241a4b83SYohann { 2354d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 236ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 237241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2388795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 239241a4b83SYohann } 240241a4b83SYohann } 241241a4b83SYohann } 242241a4b83SYohann 243241a4b83SYohann template <int NCOMP, int P1d> 244920dcdc4Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 245241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 246241a4b83SYohann { 2474d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 248ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 249241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 250241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 251241a4b83SYohann } 252241a4b83SYohann } 253241a4b83SYohann } 254241a4b83SYohann 255d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 256d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 257ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) 258ccedf6b0Sjeremylt { 259ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 260d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 261ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 262d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 263ccedf6b0Sjeremylt } 264ccedf6b0Sjeremylt } 265ccedf6b0Sjeremylt } 266ccedf6b0Sjeremylt 267241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 268241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 269241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 270241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 271241a4b83SYohann __syncthreads(); 272241a4b83SYohann *V = 0.0; 273ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 274241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 275241a4b83SYohann } 276241a4b83SYohann __syncthreads(); 277241a4b83SYohann } 278241a4b83SYohann 279241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 280241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 281241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 282241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 283241a4b83SYohann __syncthreads(); 284241a4b83SYohann *V = 0.0; 285ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 286241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 287241a4b83SYohann } 288241a4b83SYohann __syncthreads(); 289241a4b83SYohann } 290241a4b83SYohann 291241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 292241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 293241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 294241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 295241a4b83SYohann __syncthreads(); 296241a4b83SYohann *V = 0.0; 297241a4b83SYohann if (data.tidy<P1d) { 298ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 299241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 300241a4b83SYohann } 301241a4b83SYohann } 302241a4b83SYohann __syncthreads(); 303241a4b83SYohann } 304241a4b83SYohann 305241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 306241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 307241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 308241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 309241a4b83SYohann __syncthreads(); 310241a4b83SYohann *V = 0.0; 311241a4b83SYohann if (data.tidx<P1d) { 312ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 313241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 314241a4b83SYohann } 315241a4b83SYohann } 316241a4b83SYohann __syncthreads(); 317241a4b83SYohann } 318241a4b83SYohann 319241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 320241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 321241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 322241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 323241a4b83SYohann __syncthreads(); 324241a4b83SYohann if (data.tidx<P1d) { 325ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 326241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 327241a4b83SYohann } 328241a4b83SYohann } 329241a4b83SYohann __syncthreads(); 330241a4b83SYohann } 331241a4b83SYohann 332241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 333241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 334241a4b83SYohann CeedScalar *__restrict__ r_V) { 335241a4b83SYohann CeedScalar r_t[1]; 336ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 337241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 338241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 339241a4b83SYohann } 340241a4b83SYohann } 341241a4b83SYohann 342241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 343241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 344241a4b83SYohann CeedScalar *__restrict__ r_V) { 345241a4b83SYohann CeedScalar r_t[1]; 346ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 347241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 348241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 349241a4b83SYohann } 350241a4b83SYohann } 351241a4b83SYohann 352241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 353241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 354241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 355241a4b83SYohann CeedScalar *__restrict__ r_V) { 356241a4b83SYohann CeedScalar r_t[1]; 357ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 358241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 359241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 360241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 361241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 362241a4b83SYohann } 363241a4b83SYohann } 364241a4b83SYohann 365241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 366241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 367241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 368241a4b83SYohann CeedScalar *__restrict__ r_V) { 369241a4b83SYohann CeedScalar r_t[1]; 370ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 371241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 372241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 373241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 374241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 375241a4b83SYohann } 376241a4b83SYohann } 377241a4b83SYohann 378241a4b83SYohann //**** 379241a4b83SYohann // 3D 380241a4b83SYohann template <int NCOMP, int P1d> 381920dcdc4Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 382241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 383241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3844d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 385ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 386241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 3878795c945Sjeremylt r_u[z+comp*P1d] = d_u[ind + nnodes * comp]; 388241a4b83SYohann } 389241a4b83SYohann } 390241a4b83SYohann } 391241a4b83SYohann } 392241a4b83SYohann 393241a4b83SYohann template <int NCOMP, int P1d> 394920dcdc4Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 395241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 396241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3974d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 398ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 399241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 400241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 401241a4b83SYohann } 402241a4b83SYohann } 403241a4b83SYohann } 404241a4b83SYohann } 405920dcdc4Sjeremylt 406d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 407d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 408ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 409ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 410ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 411d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 412ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 413d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 414ccedf6b0Sjeremylt } 415ccedf6b0Sjeremylt } 416ccedf6b0Sjeremylt } 417ccedf6b0Sjeremylt } 418241a4b83SYohann 419241a4b83SYohann template <int NCOMP, int Q1d> 420920dcdc4Sjeremylt inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 421ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 422920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 423ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 424ac421f39SYohann r_u[comp] = d_u[ind + nquads * comp]; 425ac421f39SYohann } 426ac421f39SYohann } 427ac421f39SYohann 428ac421f39SYohann template <int NCOMP, int Q1d> 429920dcdc4Sjeremylt inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 430ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 431920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; 432ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 433ac421f39SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 434ac421f39SYohann } 435ac421f39SYohann } 436ac421f39SYohann 437d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 43825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 439920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 440d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 441920dcdc4Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 442d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 443920dcdc4Sjeremylt } 444920dcdc4Sjeremylt } 445920dcdc4Sjeremylt 446241a4b83SYohann template <int NCOMP, int P1d> 447920dcdc4Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 448241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 449241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4504d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 451ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 452241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 4538795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]); 454241a4b83SYohann } 455241a4b83SYohann } 456241a4b83SYohann } 457241a4b83SYohann } 458241a4b83SYohann 459241a4b83SYohann template <int NCOMP, int P1d> 460920dcdc4Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 461241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 462241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4634d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 464ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 465241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 466241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 467241a4b83SYohann } 468241a4b83SYohann } 469241a4b83SYohann } 470241a4b83SYohann } 471241a4b83SYohann 472d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4732f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 474ccedf6b0Sjeremylt if (data.tidx<P1d && data.tidy<P1d) { 475ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 476ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 477d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 478ccedf6b0Sjeremylt for (CeedInt comp = 0; comp < NCOMP; ++comp) { 479d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 480ccedf6b0Sjeremylt } 481ccedf6b0Sjeremylt } 482ccedf6b0Sjeremylt } 483ccedf6b0Sjeremylt } 484ccedf6b0Sjeremylt 485241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 486241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 487241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 488ac421f39SYohann CeedScalar r_B[P1d]; 489ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 490ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 491ac421f39SYohann } 492ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 493241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 494241a4b83SYohann __syncthreads(); 495241a4b83SYohann V[k] = 0.0; 496ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 497ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 498241a4b83SYohann } 499241a4b83SYohann __syncthreads(); 500241a4b83SYohann } 501241a4b83SYohann } 502241a4b83SYohann 503241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 504241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 505241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 506ac421f39SYohann CeedScalar r_B[P1d]; 507ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 508ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 509ac421f39SYohann } 510ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 511241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 512241a4b83SYohann __syncthreads(); 513241a4b83SYohann V[k] = 0.0; 514ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 515ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 516241a4b83SYohann } 517241a4b83SYohann __syncthreads(); 518241a4b83SYohann } 519241a4b83SYohann } 520241a4b83SYohann 521241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 522241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 523241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 524ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 525241a4b83SYohann V[k] = 0.0; 526ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 527241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 528241a4b83SYohann } 529241a4b83SYohann } 530241a4b83SYohann } 531241a4b83SYohann 532241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 533241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 534241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 535ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 536241a4b83SYohann V[k] = 0.0; 537241a4b83SYohann if (k<P1d) { 538ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 539241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 540241a4b83SYohann } 541241a4b83SYohann } 542241a4b83SYohann } 543241a4b83SYohann } 544241a4b83SYohann 545241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 546241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 547241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 548ac421f39SYohann CeedScalar r_B[Q1d]; 549ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 550ac421f39SYohann { 551ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 552ac421f39SYohann } 553ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 554241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 555241a4b83SYohann __syncthreads(); 556241a4b83SYohann V[k] = 0.0; 557241a4b83SYohann if (data.tidy<P1d) { 558ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 559ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 560ac421f39SYohann } 561ac421f39SYohann } 562ac421f39SYohann __syncthreads(); 563ac421f39SYohann } 564ac421f39SYohann } 565ac421f39SYohann 566ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 567ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data, 568ac421f39SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 569ac421f39SYohann CeedScalar r_B[Q1d]; 570ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 571ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 572ac421f39SYohann } 573ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 574ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 575ac421f39SYohann __syncthreads(); 576ac421f39SYohann if (data.tidy<P1d) { 577ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 578ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 579241a4b83SYohann } 580241a4b83SYohann } 581241a4b83SYohann __syncthreads(); 582241a4b83SYohann } 583241a4b83SYohann } 584241a4b83SYohann 585241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 586241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 587241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 588ac421f39SYohann CeedScalar r_B[Q1d]; 589ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 590ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 591ac421f39SYohann } 592ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 593241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 594241a4b83SYohann __syncthreads(); 595241a4b83SYohann V[k] = 0.0; 596241a4b83SYohann if (data.tidx<P1d) { 597ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 598ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 599241a4b83SYohann } 600241a4b83SYohann } 601241a4b83SYohann __syncthreads(); 602241a4b83SYohann } 603241a4b83SYohann } 604241a4b83SYohann 605241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 606241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 607241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 608ac421f39SYohann CeedScalar r_B[Q1d]; 609ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 610ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 611ac421f39SYohann } 612ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 613241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 614241a4b83SYohann __syncthreads(); 615241a4b83SYohann if (data.tidx<P1d) { 616ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 617ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 618241a4b83SYohann } 619241a4b83SYohann } 620241a4b83SYohann __syncthreads(); 621241a4b83SYohann } 622241a4b83SYohann } 623241a4b83SYohann 624241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 625241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 626241a4b83SYohann CeedScalar *__restrict__ r_V) { 627241a4b83SYohann CeedScalar r_t1[Q1d]; 628241a4b83SYohann CeedScalar r_t2[Q1d]; 629ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 630241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 631241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 632241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 633241a4b83SYohann } 634241a4b83SYohann } 635241a4b83SYohann 636241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 637241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 638241a4b83SYohann CeedScalar *__restrict__ r_V) { 639241a4b83SYohann CeedScalar r_t1[Q1d]; 640241a4b83SYohann CeedScalar r_t2[Q1d]; 641ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 642241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 643241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 644241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 645241a4b83SYohann } 646241a4b83SYohann } 647241a4b83SYohann 648241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 649241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 650241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 651241a4b83SYohann CeedScalar *__restrict__ r_V) { 652241a4b83SYohann CeedScalar r_t1[Q1d]; 653241a4b83SYohann CeedScalar r_t2[Q1d]; 654ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 655241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 656241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 657241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 658241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 659241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 660241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 661241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 662241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 663241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 664241a4b83SYohann } 665241a4b83SYohann } 666241a4b83SYohann 667241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 668241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 669241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 670241a4b83SYohann CeedScalar *__restrict__ r_V) { 671241a4b83SYohann CeedScalar r_t1[Q1d]; 672241a4b83SYohann CeedScalar r_t2[Q1d]; 673ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 674241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 675241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 676241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 677241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 678241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 679241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 680241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 681241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 682241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 683241a4b83SYohann } 684241a4b83SYohann } 685241a4b83SYohann 686ac421f39SYohann template <int NCOMP, int Q1d> 687ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, 688ac421f39SYohann const CeedScalar *__restrict__ r_U, 689ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 690ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 691ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d]; 692ac421f39SYohann __syncthreads(); 693ac421f39SYohann // X derivative 694ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 695ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 696ac421f39SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 697ac421f39SYohann } 698ac421f39SYohann // Y derivative 699ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 700ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 701ac421f39SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 702ac421f39SYohann } 703ac421f39SYohann // Z derivative 704ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 705ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 706ac421f39SYohann r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative) 707ac421f39SYohann } 708ac421f39SYohann __syncthreads(); 709ac421f39SYohann } 710ac421f39SYohann } 711ac421f39SYohann 712ac421f39SYohann template <int NCOMP, int Q1d> 713ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, 714ac421f39SYohann const CeedScalar *__restrict__ r_U, 715ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 716ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 717ac421f39SYohann // X derivative 718ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 719ac421f39SYohann __syncthreads(); 720ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 721ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 722ac421f39SYohann } 723ac421f39SYohann __syncthreads(); 724ac421f39SYohann // Y derivative 725ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 726ac421f39SYohann __syncthreads(); 727ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 728ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 729ac421f39SYohann } 730ac421f39SYohann __syncthreads(); 731ac421f39SYohann // Z derivative 732ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 733ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative) 734ac421f39SYohann } 735ac421f39SYohann } 736ac421f39SYohann } 737ac421f39SYohann 738241a4b83SYohann template <int Q1d> 739241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 740241a4b83SYohann *w = qweight1d[data.tidx]; 741241a4b83SYohann } 742241a4b83SYohann 743241a4b83SYohann template <int Q1d> 744241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 745241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 746241a4b83SYohann } 747241a4b83SYohann 748241a4b83SYohann template <int Q1d> 749241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 750241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 751ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 752241a4b83SYohann { 753241a4b83SYohann w[z] = pw*qweight1d[z]; 754241a4b83SYohann } 755241a4b83SYohann } 756241a4b83SYohann 757241a4b83SYohann ); 758241a4b83SYohann 759241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 760241a4b83SYohann 761241a4b83SYohann using std::ostringstream; 762241a4b83SYohann using std::string; 763241a4b83SYohann int ierr; 764241a4b83SYohann bool setupdone; 765241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 766241a4b83SYohann if (setupdone) return 0; 767241a4b83SYohann Ceed ceed; 768241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 769241a4b83SYohann CeedOperator_Cuda_gen *data; 770241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 771241a4b83SYohann CeedQFunction qf; 772241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 773241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 774241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 775*1da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 776*1da99368SJeremy L Thompson numoutputfields, ncomp, dim = 0, nnodes; 777241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 778241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 779241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 780241a4b83SYohann CeedChk(ierr); 781241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 782241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 783241a4b83SYohann CeedChk(ierr); 784241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 785241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 786241a4b83SYohann CeedChk(ierr); 787241a4b83SYohann CeedEvalMode emode; 78861dbc9d2Sjeremylt CeedInterlaceMode imode; 789241a4b83SYohann CeedBasis basis; 790241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 791241a4b83SYohann CeedElemRestriction Erestrict; 792241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 793241a4b83SYohann 794241a4b83SYohann ostringstream code; 795241a4b83SYohann string devFunctions(deviceFunctions); 796241a4b83SYohann 797f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 798f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 799f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 800abfaacbbSSander Arens ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); 801f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 802f1a13f77SYohann Dudouit if (prop.major<6){ 803f1a13f77SYohann Dudouit code << atomicAdd; 804f1a13f77SYohann Dudouit } 805f1a13f77SYohann Dudouit 806241a4b83SYohann code << devFunctions; 807241a4b83SYohann 808241a4b83SYohann string qFunction(qf_data->qFunctionSource); 8094d537eeaSYohann 8104d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 811ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 812*1da99368SJeremy L Thompson 813*1da99368SJeremy L Thompson // Find dim and Q1d 814*1da99368SJeremy L Thompson bool collograd = false; 815*1da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 816*1da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 817*1da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 818*1da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 819*1da99368SJeremy L Thompson 820*1da99368SJeremy L Thompson // Check for collocated gradient 821*1da99368SJeremy L Thompson if (basis_data->d_collograd1d) 822*1da99368SJeremy L Thompson collograd = true; 823*1da99368SJeremy L Thompson 824*1da99368SJeremy L Thompson // Collect dim and Q1d 825*1da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 826*1da99368SJeremy L Thompson bool isTensor; 827*1da99368SJeremy L Thompson ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 828*1da99368SJeremy L Thompson if (isTensor) { 829*1da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 830*1da99368SJeremy L Thompson } else { 831*1da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 832*1da99368SJeremy L Thompson } 833*1da99368SJeremy L Thompson } 834*1da99368SJeremy L Thompson } 835*1da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 836*1da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 837*1da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 838*1da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 839*1da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 840*1da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 841*1da99368SJeremy L Thompson // Collect dim and Q1d 842*1da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 843*1da99368SJeremy L Thompson bool isTensor; 844*1da99368SJeremy L Thompson ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 845*1da99368SJeremy L Thompson if (isTensor) { 846*1da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 847*1da99368SJeremy L Thompson } else { 848*1da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 849*1da99368SJeremy L Thompson } 850*1da99368SJeremy L Thompson } 851*1da99368SJeremy L Thompson } 852*1da99368SJeremy L Thompson data->dim = dim; 853*1da99368SJeremy L Thompson data->Q1d = Q1d; 854*1da99368SJeremy L Thompson 855*1da99368SJeremy L Thompson // Define CEED_Q_VLA 856*1da99368SJeremy L Thompson if (dim != 3 || collograd) { 857*1da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 858*1da99368SJeremy L Thompson } else { 859*1da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 860*1da99368SJeremy L Thompson } 861*1da99368SJeremy L Thompson 862241a4b83SYohann code << qFunction; 863241a4b83SYohann 864241a4b83SYohann // Setup 865d80fc06aSjeremylt code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 866241a4b83SYohann // Input Evecs and Restriction 867241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 868241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 869241a4b83SYohann CeedChk(ierr); 870*1da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 871241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 872241a4b83SYohann } 873241a4b83SYohann } 874241a4b83SYohann 875241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 876241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 877241a4b83SYohann } 878*1da99368SJeremy L Thompson 879241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 880241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 881*1da99368SJeremy L Thompson 882241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 883241a4b83SYohann code << "BackendData data;\n"; 884241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 885241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 886241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 887241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 888241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 889920dcdc4Sjeremylt 890920dcdc4Sjeremylt code << "\n// Input field constants and basis data\n"; 891ac421f39SYohann //Initialize constants, and matrices B and G 892241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 893920dcdc4Sjeremylt code << "// -- Input field "<<i<<" --\n"; 894241a4b83SYohann // Get elemsize, emode, ncomp 895241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 896241a4b83SYohann CeedChk(ierr); 897241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 898241a4b83SYohann CeedChk(ierr); 899241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 900241a4b83SYohann CeedChk(ierr); 9014d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 902241a4b83SYohann CeedChk(ierr); 903920dcdc4Sjeremylt 904920dcdc4Sjeremylt // Set field constants 905920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 906920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 907920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 908920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 909920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 910920dcdc4Sjeremylt } else { 911920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 912920dcdc4Sjeremylt } 913920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 914920dcdc4Sjeremylt } 915920dcdc4Sjeremylt 916920dcdc4Sjeremylt // Load basis data 917920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 918241a4b83SYohann switch (emode) { 919241a4b83SYohann case CEED_EVAL_NONE: 920241a4b83SYohann break; 921241a4b83SYohann case CEED_EVAL_INTERP: 922241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 923241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 924241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 925241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 926241a4b83SYohann break; 927241a4b83SYohann case CEED_EVAL_GRAD: 928241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 929241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 930241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932ac421f39SYohann if (basis_data->d_collograd1d) { 933ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 934ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 935ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 936ac421f39SYohann } else { 937ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 938ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 939241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 940ac421f39SYohann } 941241a4b83SYohann break; 942241a4b83SYohann case CEED_EVAL_WEIGHT: 943241a4b83SYohann break; // No action 944241a4b83SYohann case CEED_EVAL_DIV: 945241a4b83SYohann break; // TODO: Not implemented 946241a4b83SYohann case CEED_EVAL_CURL: 947241a4b83SYohann break; // TODO: Not implemented 948241a4b83SYohann } 949241a4b83SYohann } 950920dcdc4Sjeremylt 951920dcdc4Sjeremylt code << "\n// Output field constants and basis data\n"; 952241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 953920dcdc4Sjeremylt code << "// -- Output field "<<i<<" --\n"; 954241a4b83SYohann // Get elemsize, emode, ncomp 955241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 956241a4b83SYohann CeedChk(ierr); 957241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 958241a4b83SYohann CeedChk(ierr); 959241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 960241a4b83SYohann CeedChk(ierr); 9614d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 962241a4b83SYohann CeedChk(ierr); 963920dcdc4Sjeremylt 964920dcdc4Sjeremylt // Set field constants 965241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 966920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 967241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 968241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 969920dcdc4Sjeremylt } else { 970920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 971920dcdc4Sjeremylt } 972920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 973920dcdc4Sjeremylt 974920dcdc4Sjeremylt // Load basis data 975920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 976920dcdc4Sjeremylt switch (emode) { 977920dcdc4Sjeremylt case CEED_EVAL_NONE: 978920dcdc4Sjeremylt break; // No action 979920dcdc4Sjeremylt case CEED_EVAL_INTERP: 980241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 981241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 982241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 983241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 984241a4b83SYohann break; 985241a4b83SYohann case CEED_EVAL_GRAD: 986241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 988241a4b83SYohann code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990ac421f39SYohann if (basis_data->d_collograd1d) { 991ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 992ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 993ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 994ac421f39SYohann } else { 995ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 996ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 997241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 998ac421f39SYohann } 999241a4b83SYohann break; 1000241a4b83SYohann case CEED_EVAL_WEIGHT: { 1001241a4b83SYohann Ceed ceed; 1002241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1003241a4b83SYohann return CeedError(ceed, 1, 1004241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1005241a4b83SYohann break; // Should not occur 1006241a4b83SYohann } 1007241a4b83SYohann case CEED_EVAL_DIV: 1008241a4b83SYohann break; // TODO: Not implemented 1009241a4b83SYohann case CEED_EVAL_CURL: 1010241a4b83SYohann break; // TODO: Not implemented 1011241a4b83SYohann } 1012241a4b83SYohann } 1013ac421f39SYohann code << "\n"; 1014ac421f39SYohann code << "__syncthreads();\n"; 1015241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1016241a4b83SYohann // Input basis apply if needed 1017ac421f39SYohann // Generate the correct eval mode code for each input 1018920dcdc4Sjeremylt code << "\n// Input field restrictions and basis actions\n"; 1019241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1020920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1021241a4b83SYohann // Get elemsize, emode, ncomp 1022241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1023241a4b83SYohann CeedChk(ierr); 1024241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1025241a4b83SYohann CeedChk(ierr); 1026241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1027241a4b83SYohann CeedChk(ierr); 10284d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1029241a4b83SYohann CeedChk(ierr); 1030920dcdc4Sjeremylt 1031920dcdc4Sjeremylt // Restriction 1032920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 1033920dcdc4Sjeremylt !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 1034241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1035241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1036241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 1037f2b2a896Sjeremylt if (data->indices.in[i]) { 10383d05795bSjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 10393d05795bSjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 1040ccedf6b0Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1041920dcdc4Sjeremylt code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1042920dcdc4Sjeremylt code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1043ccedf6b0Sjeremylt } else { 1044920dcdc4Sjeremylt bool backendstrides; 1045920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1046920dcdc4Sjeremylt &backendstrides); 1047920dcdc4Sjeremylt CeedChk(ierr); 1048920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1049920dcdc4Sjeremylt if (!backendstrides) { 1050920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1051920dcdc4Sjeremylt CeedChk(ierr); 1052ccedf6b0Sjeremylt } 1053920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1054d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1055920dcdc4Sjeremylt } 1056920dcdc4Sjeremylt } 1057920dcdc4Sjeremylt 1058920dcdc4Sjeremylt // Basis action 1059920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1060920dcdc4Sjeremylt switch (emode) { 1061920dcdc4Sjeremylt case CEED_EVAL_NONE: 1062920dcdc4Sjeremylt if (!basis_data->d_collograd1d) { 1063920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1064920dcdc4Sjeremylt } 1065920dcdc4Sjeremylt break; 1066920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1067241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1068241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1069241a4b83SYohann break; 1070241a4b83SYohann case CEED_EVAL_GRAD: 1071ac421f39SYohann if (basis_data->d_collograd1d) { 1072ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1073ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1074ac421f39SYohann } else { 1075241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1076241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1077ac421f39SYohann } 1078241a4b83SYohann break; 1079241a4b83SYohann case CEED_EVAL_WEIGHT: 1080241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1081241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1082241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1083241a4b83SYohann data->W = basis_data->d_qweight1d; 1084241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1085241a4b83SYohann break; // No action 1086241a4b83SYohann case CEED_EVAL_DIV: 1087241a4b83SYohann break; // TODO: Not implemented 1088241a4b83SYohann case CEED_EVAL_CURL: 1089241a4b83SYohann break; // TODO: Not implemented 1090241a4b83SYohann } 1091241a4b83SYohann } 1092ac421f39SYohann 1093241a4b83SYohann // Q function 1094920dcdc4Sjeremylt code << "\n// Output field setup\n"; 1095241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1096920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1097241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1098241a4b83SYohann CeedChk(ierr); 1099241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1100241a4b83SYohann { 1101ac421f39SYohann if (basis_data->d_collograd1d) { 1102ac421f39SYohann //Accumulator for gradient slices 1103ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1104ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1105ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1106ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1107ac421f39SYohann code << " }\n"; 1108ac421f39SYohann code << " }\n"; 1109ac421f39SYohann } else { 1110241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1111241a4b83SYohann } 1112ac421f39SYohann } 1113241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1114241a4b83SYohann { 1115241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1116241a4b83SYohann } 1117241a4b83SYohann } 1118ac421f39SYohann //We treat quadrature points per slice in 3d to save registers 1119ac421f39SYohann if (basis_data->d_collograd1d) { 1120920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1121ac421f39SYohann code << "#pragma unroll\n"; 1122ac421f39SYohann code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1123ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1124920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1125ac421f39SYohann // Get elemsize, emode, ncomp 1126ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1127ac421f39SYohann CeedChk(ierr); 1128ac421f39SYohann // Basis action 1129920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1130ac421f39SYohann switch (emode) { 1131ac421f39SYohann case CEED_EVAL_NONE: 1132ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1133920dcdc4Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1134920dcdc4Sjeremylt data->indices.in[i] = restr_data->d_ind; 1135920dcdc4Sjeremylt if (data->indices.in[i]) { 1136*1da99368SJeremy L Thompson ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 1137*1da99368SJeremy L Thompson code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 113861dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1139920dcdc4Sjeremylt code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1140*1da99368SJeremy L Thompson code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nnodes_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1141920dcdc4Sjeremylt } else { 1142920dcdc4Sjeremylt bool backendstrides; 1143920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1144920dcdc4Sjeremylt &backendstrides); 1145920dcdc4Sjeremylt CeedChk(ierr); 1146920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1147920dcdc4Sjeremylt if (!backendstrides) { 1148920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1149920dcdc4Sjeremylt CeedChk(ierr); 1150920dcdc4Sjeremylt } 1151920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1152d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1153920dcdc4Sjeremylt } 1154ac421f39SYohann break; 1155ac421f39SYohann case CEED_EVAL_INTERP: 1156ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1157ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1158ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1159ac421f39SYohann code << " }\n"; 1160ac421f39SYohann break; 1161ac421f39SYohann case CEED_EVAL_GRAD: 1162ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1163ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1164ac421f39SYohann break; 1165ac421f39SYohann case CEED_EVAL_WEIGHT: 1166ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1167ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1168ac421f39SYohann break; // No action 1169ac421f39SYohann case CEED_EVAL_DIV: 1170ac421f39SYohann break; // TODO: Not implemented 1171ac421f39SYohann case CEED_EVAL_CURL: 1172ac421f39SYohann break; // TODO: Not implemented 1173ac421f39SYohann } 1174ac421f39SYohann } 1175ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1176920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1177ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1178ac421f39SYohann CeedChk(ierr); 1179ac421f39SYohann // Basis action 1180ac421f39SYohann switch (emode) { 1181ac421f39SYohann case CEED_EVAL_NONE: 1182ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1183ac421f39SYohann break; // No action 1184ac421f39SYohann case CEED_EVAL_INTERP: 1185ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1186ac421f39SYohann break; 1187ac421f39SYohann case CEED_EVAL_GRAD: 1188ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1189ac421f39SYohann break; 1190ac421f39SYohann case CEED_EVAL_WEIGHT: 1191ac421f39SYohann break; // Should not occur 1192ac421f39SYohann case CEED_EVAL_DIV: 1193ac421f39SYohann break; // TODO: Not implemented 1194ac421f39SYohann case CEED_EVAL_CURL: 1195ac421f39SYohann break; // TODO: Not implemented 1196ac421f39SYohann } 1197ac421f39SYohann } 1198ac421f39SYohann } else { 1199920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1200ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1201920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1202ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1203ac421f39SYohann } 1204ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1205920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1206ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1207ac421f39SYohann } 1208ac421f39SYohann } 1209920dcdc4Sjeremylt code << " // QFunction Inputs and outputs\n"; 12104d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12114d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1212920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1213ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12144d537eeaSYohann } 12154d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12164d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1217920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1218ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12194d537eeaSYohann } 1220920dcdc4Sjeremylt code << "\n // Apply QFunction\n"; 1221241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 1222ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1223ac421f39SYohann if (dim != 3 || basis_data->d_collograd1d) { 1224ac421f39SYohann code << "1 "; 1225ac421f39SYohann } else { 1226ac421f39SYohann code << "Q1d "; 1227ac421f39SYohann } 1228ac421f39SYohann code << ", in, out);\n"; 1229ac421f39SYohann if (basis_data->d_collograd1d) { 1230920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1231ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1232920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1233ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1234ac421f39SYohann CeedChk(ierr); 1235ac421f39SYohann // Basis action 1236920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1237ac421f39SYohann switch (emode) { 1238ac421f39SYohann case CEED_EVAL_NONE: 1239ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1240ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1241ac421f39SYohann code << " }\n"; 1242ac421f39SYohann break; // No action 1243ac421f39SYohann case CEED_EVAL_INTERP: 1244ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1245ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1246ac421f39SYohann code << " }\n"; 1247ac421f39SYohann break; 1248ac421f39SYohann case CEED_EVAL_GRAD: 1249ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1250ac421f39SYohann break; 1251ac421f39SYohann case CEED_EVAL_WEIGHT: 1252ac421f39SYohann break; // Should not occur 1253ac421f39SYohann case CEED_EVAL_DIV: 1254ac421f39SYohann break; // TODO: Not implemented 1255ac421f39SYohann case CEED_EVAL_CURL: 1256ac421f39SYohann break; // TODO: Not implemented 1257ac421f39SYohann } 1258ac421f39SYohann } 1259ac421f39SYohann code << "}\n"; 1260ac421f39SYohann } 1261241a4b83SYohann 1262241a4b83SYohann // Output basis apply if needed 1263ac421f39SYohann // Generate the correct eval mode code for each output 1264920dcdc4Sjeremylt code << "\n// Output field basis action and restrictions\n"; 1265241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1266920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1267241a4b83SYohann // Get elemsize, emode, ncomp 1268241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1269241a4b83SYohann CeedChk(ierr); 1270241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1271241a4b83SYohann CeedChk(ierr); 1272241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1273241a4b83SYohann CeedChk(ierr); 12744d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1275241a4b83SYohann CeedChk(ierr); 1276241a4b83SYohann // Basis action 1277920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1278241a4b83SYohann switch (emode) { 1279241a4b83SYohann case CEED_EVAL_NONE: 1280920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1281241a4b83SYohann break; // No action 1282241a4b83SYohann case CEED_EVAL_INTERP: 1283241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1284241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1285241a4b83SYohann break; 1286241a4b83SYohann case CEED_EVAL_GRAD: 1287241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1288ac421f39SYohann if (basis_data->d_collograd1d) { 1289ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1290ac421f39SYohann } else { 1291241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1292ac421f39SYohann } 1293241a4b83SYohann break; 1294241a4b83SYohann case CEED_EVAL_WEIGHT: { 1295241a4b83SYohann Ceed ceed; 1296241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1297241a4b83SYohann return CeedError(ceed, 1, 1298241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1299241a4b83SYohann break; // Should not occur 1300241a4b83SYohann } 1301241a4b83SYohann case CEED_EVAL_DIV: 1302241a4b83SYohann break; // TODO: Not implemented 1303241a4b83SYohann case CEED_EVAL_CURL: 1304241a4b83SYohann break; // TODO: Not implemented 1305241a4b83SYohann } 1306920dcdc4Sjeremylt // Restriction 1307920dcdc4Sjeremylt ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1308920dcdc4Sjeremylt data->indices.out[i] = restr_data->d_ind; 1309920dcdc4Sjeremylt if (data->indices.out[i]) { 13103d05795bSjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 13113d05795bSjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1312920dcdc4Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1313920dcdc4Sjeremylt code << " // InterlaceMode: "<<CeedInterlaceModes[imode]<<"\n"; 1314920dcdc4Sjeremylt code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1315920dcdc4Sjeremylt } else { 1316920dcdc4Sjeremylt bool backendstrides; 1317920dcdc4Sjeremylt ierr = CeedElemRestrictionGetBackendStridesStatus(Erestrict, 1318920dcdc4Sjeremylt &backendstrides); 1319920dcdc4Sjeremylt CeedChk(ierr); 1320920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1321920dcdc4Sjeremylt if (!backendstrides) { 1322920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1323920dcdc4Sjeremylt CeedChk(ierr); 1324920dcdc4Sjeremylt } 1325920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1326d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1327920dcdc4Sjeremylt } 1328241a4b83SYohann } 1329241a4b83SYohann 1330241a4b83SYohann code << " }\n"; 1331241a4b83SYohann code << "}\n\n"; 1332241a4b83SYohann 1333241a4b83SYohann // std::cout << code.str(); 1334241a4b83SYohann 1335920dcdc4Sjeremylt ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1336920dcdc4Sjeremylt CeedChk(ierr); 1337241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1338241a4b83SYohann CeedChk(ierr); 1339241a4b83SYohann 1340241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1341241a4b83SYohann 1342241a4b83SYohann return 0; 1343241a4b83SYohann } 1344