1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39f1a13f77SYohann Dudouit ); 40f1a13f77SYohann Dudouit 41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45f1a13f77SYohann Dudouit 46f1a13f77SYohann Dudouit typedef struct { 47f1a13f77SYohann Dudouit CeedInt tidx; 48f1a13f77SYohann Dudouit CeedInt tidy; 49f1a13f77SYohann Dudouit CeedInt tidz; 50f1a13f77SYohann Dudouit CeedInt tid; 51f1a13f77SYohann Dudouit CeedScalar* slice; 52f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56ac421f39SYohann for (CeedInt i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann } 60241a4b83SYohann 61241a4b83SYohann //**** 62241a4b83SYohann // 1D 63241a4b83SYohann template <int NCOMP, int P1d> 648795c945Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 65241a4b83SYohann if (data.tidx<P1d) 66241a4b83SYohann { 674d537eeaSYohann const CeedInt node = data.tidx; 684d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 69241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 708795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 71241a4b83SYohann } 72241a4b83SYohann } 73241a4b83SYohann } 74241a4b83SYohann 75241a4b83SYohann template <int NCOMP, int P1d> 768795c945Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77241a4b83SYohann if (data.tidx<P1d) 78241a4b83SYohann { 794d537eeaSYohann const CeedInt node = data.tidx; 804d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 81241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 82241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85241a4b83SYohann } 86241a4b83SYohann 87241a4b83SYohann template <int NCOMP, int Q1d> 88241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 894d537eeaSYohann const CeedInt node = data.tidx; 904d537eeaSYohann const CeedInt ind = node + elem * Q1d; 91241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 92241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 93241a4b83SYohann } 94241a4b83SYohann } 95241a4b83SYohann 96241a4b83SYohann template <int NCOMP, int Q1d> 97241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 984d537eeaSYohann const CeedInt node = data.tidx; 994d537eeaSYohann const CeedInt ind = node + elem * Q1d; 100241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 101241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 102241a4b83SYohann } 103241a4b83SYohann } 104241a4b83SYohann 105241a4b83SYohann template <int NCOMP, int P1d> 1068795c945Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 107241a4b83SYohann if (data.tidx<P1d) 108241a4b83SYohann { 1094d537eeaSYohann const CeedInt node = data.tidx; 1104d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 111241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 1128795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 113241a4b83SYohann } 114241a4b83SYohann } 115241a4b83SYohann } 116241a4b83SYohann 117241a4b83SYohann template <int NCOMP, int P1d> 1188795c945Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 119241a4b83SYohann if (data.tidx<P1d) 120241a4b83SYohann { 1214d537eeaSYohann const CeedInt node = data.tidx; 1224d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 123241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 124241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 125241a4b83SYohann } 126241a4b83SYohann } 127241a4b83SYohann } 128241a4b83SYohann 129241a4b83SYohann template <int NCOMP, int Q1d> 130241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 1314d537eeaSYohann const CeedInt node = data.tidx; 1324d537eeaSYohann const CeedInt ind = node + elem * Q1d; 133241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 134241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 135241a4b83SYohann } 136241a4b83SYohann } 137241a4b83SYohann 138241a4b83SYohann template <int NCOMP, int Q1d> 139241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 1404d537eeaSYohann const CeedInt node = data.tidx; 1414d537eeaSYohann const CeedInt ind = node + elem * Q1d; 142241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 143241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 144241a4b83SYohann } 145241a4b83SYohann } 146241a4b83SYohann 147241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 148241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 149241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 150241a4b83SYohann data.slice[data.tidx] = *U; 151241a4b83SYohann __syncthreads(); 152241a4b83SYohann *V = 0.0; 153ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 154241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 155241a4b83SYohann } 156241a4b83SYohann __syncthreads(); 157241a4b83SYohann } 158241a4b83SYohann 159241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 160241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 161241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 162241a4b83SYohann data.slice[data.tidx] = *U; 163241a4b83SYohann __syncthreads(); 164241a4b83SYohann *V = 0.0; 165ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 166241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 167241a4b83SYohann } 168241a4b83SYohann __syncthreads(); 169241a4b83SYohann } 170241a4b83SYohann 171241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 172241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 173241a4b83SYohann CeedScalar *__restrict__ r_V) { 174ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 175241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 176241a4b83SYohann } 177241a4b83SYohann } 178241a4b83SYohann 179241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 180241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 181241a4b83SYohann CeedScalar *__restrict__ r_V) { 182ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 183241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 184241a4b83SYohann } 185241a4b83SYohann } 186241a4b83SYohann 187241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 188241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 189241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 190241a4b83SYohann CeedScalar *__restrict__ r_V) { 191ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 192241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 193241a4b83SYohann } 194241a4b83SYohann } 195241a4b83SYohann 196241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 197241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 198241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 199241a4b83SYohann CeedScalar *__restrict__ r_V) { 200ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 201241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 202241a4b83SYohann } 203241a4b83SYohann } 204241a4b83SYohann 205241a4b83SYohann //**** 206241a4b83SYohann // 2D 207241a4b83SYohann template <int NCOMP, int P1d> 2088795c945Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 209241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 210241a4b83SYohann { 2114d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 2124d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 213241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2148795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 215241a4b83SYohann } 216241a4b83SYohann } 217241a4b83SYohann } 218241a4b83SYohann 219241a4b83SYohann template <int NCOMP, int P1d> 2208795c945Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 221241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 222241a4b83SYohann { 2234d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 2244d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 225241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 226241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 227241a4b83SYohann } 228241a4b83SYohann } 229241a4b83SYohann } 230241a4b83SYohann 231241a4b83SYohann template <int NCOMP, int Q1d> 232241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 2334d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2344d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 235241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 236241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 237241a4b83SYohann } 238241a4b83SYohann } 239241a4b83SYohann 240241a4b83SYohann template <int NCOMP, int Q1d> 241241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 2424d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2434d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 244241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 245241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 246241a4b83SYohann } 247241a4b83SYohann } 248241a4b83SYohann 249241a4b83SYohann template <int NCOMP, int P1d> 2508795c945Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 251241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 252241a4b83SYohann { 2534d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 2544d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 255241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 2568795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 257241a4b83SYohann } 258241a4b83SYohann } 259241a4b83SYohann } 260241a4b83SYohann 261241a4b83SYohann template <int NCOMP, int P1d> 2628795c945Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 263241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 264241a4b83SYohann { 2654d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 2664d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 267241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 268241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 269241a4b83SYohann } 270241a4b83SYohann } 271241a4b83SYohann } 272241a4b83SYohann 273241a4b83SYohann template <int NCOMP, int Q1d> 274241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 2754d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2764d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 277241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 278241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 279241a4b83SYohann } 280241a4b83SYohann } 281241a4b83SYohann 282241a4b83SYohann template <int NCOMP, int Q1d> 283241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 2844d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 2854d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 286241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 287241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 288241a4b83SYohann } 289241a4b83SYohann } 290241a4b83SYohann 291241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 292241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 293241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 294241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 295241a4b83SYohann __syncthreads(); 296241a4b83SYohann *V = 0.0; 297ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 298241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 299241a4b83SYohann } 300241a4b83SYohann __syncthreads(); 301241a4b83SYohann } 302241a4b83SYohann 303241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 304241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 305241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 306241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 307241a4b83SYohann __syncthreads(); 308241a4b83SYohann *V = 0.0; 309ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 310241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 311241a4b83SYohann } 312241a4b83SYohann __syncthreads(); 313241a4b83SYohann } 314241a4b83SYohann 315241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 316241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 317241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 318241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 319241a4b83SYohann __syncthreads(); 320241a4b83SYohann *V = 0.0; 321241a4b83SYohann if (data.tidy<P1d) { 322ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 323241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 324241a4b83SYohann } 325241a4b83SYohann } 326241a4b83SYohann __syncthreads(); 327241a4b83SYohann } 328241a4b83SYohann 329241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 330241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 331241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 332241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 333241a4b83SYohann __syncthreads(); 334241a4b83SYohann *V = 0.0; 335241a4b83SYohann if (data.tidx<P1d) { 336ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 337241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 338241a4b83SYohann } 339241a4b83SYohann } 340241a4b83SYohann __syncthreads(); 341241a4b83SYohann } 342241a4b83SYohann 343241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 344241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 345241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 346241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 347241a4b83SYohann __syncthreads(); 348241a4b83SYohann if (data.tidx<P1d) { 349ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 350241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 351241a4b83SYohann } 352241a4b83SYohann } 353241a4b83SYohann __syncthreads(); 354241a4b83SYohann } 355241a4b83SYohann 356241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 357241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 358241a4b83SYohann CeedScalar *__restrict__ r_V) { 359241a4b83SYohann CeedScalar r_t[1]; 360ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 361241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 362241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 363241a4b83SYohann } 364241a4b83SYohann } 365241a4b83SYohann 366241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 367241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 368241a4b83SYohann CeedScalar *__restrict__ r_V) { 369241a4b83SYohann CeedScalar r_t[1]; 370ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 371241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 372241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 373241a4b83SYohann } 374241a4b83SYohann } 375241a4b83SYohann 376241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 377241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 378241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 379241a4b83SYohann CeedScalar *__restrict__ r_V) { 380241a4b83SYohann CeedScalar r_t[1]; 381ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 382241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 383241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 384241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 385241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 386241a4b83SYohann } 387241a4b83SYohann } 388241a4b83SYohann 389241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 390241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 391241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 392241a4b83SYohann CeedScalar *__restrict__ r_V) { 393241a4b83SYohann CeedScalar r_t[1]; 394ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 395241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 396241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 397241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 398241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 399241a4b83SYohann } 400241a4b83SYohann } 401241a4b83SYohann 402241a4b83SYohann //**** 403241a4b83SYohann // 3D 404241a4b83SYohann template <int NCOMP, int P1d> 4058795c945Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 406241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 407241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4084d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4094d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 410241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 4118795c945Sjeremylt r_u[z+comp*P1d] = d_u[ind + nnodes * comp]; 412241a4b83SYohann } 413241a4b83SYohann } 414241a4b83SYohann } 415241a4b83SYohann } 416241a4b83SYohann 417241a4b83SYohann template <int NCOMP, int P1d> 4188795c945Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 419241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 420241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4214d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4224d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 423241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 424241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 425241a4b83SYohann } 426241a4b83SYohann } 427241a4b83SYohann } 428241a4b83SYohann } 429241a4b83SYohann 430241a4b83SYohann template <int NCOMP, int Q1d> 431241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 432241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 4334d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 4344d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 435241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 436241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind + nquads * comp]; 437241a4b83SYohann } 438241a4b83SYohann } 439241a4b83SYohann } 440241a4b83SYohann 441241a4b83SYohann template <int NCOMP, int Q1d> 442ac421f39SYohann inline __device__ void readSliceQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 443ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 444ac421f39SYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 445ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 446ac421f39SYohann r_u[comp] = d_u[ind + nquads * comp]; 447ac421f39SYohann } 448ac421f39SYohann } 449ac421f39SYohann 450ac421f39SYohann template <int NCOMP, int Q1d> 451241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 452241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 4534d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 4544d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 455241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 456241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp]; 457241a4b83SYohann } 458241a4b83SYohann } 459241a4b83SYohann } 460241a4b83SYohann 461ac421f39SYohann template <int NCOMP, int Q1d> 462ac421f39SYohann inline __device__ void readSliceQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 463ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 464ac421f39SYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 465ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 466ac421f39SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 467ac421f39SYohann } 468ac421f39SYohann } 469ac421f39SYohann 470241a4b83SYohann template <int NCOMP, int P1d> 4718795c945Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 472241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 473241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4744d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4754d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 476241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 4778795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]); 478241a4b83SYohann } 479241a4b83SYohann } 480241a4b83SYohann } 481241a4b83SYohann } 482241a4b83SYohann 483241a4b83SYohann template <int NCOMP, int P1d> 4848795c945Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 485241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 486241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4874d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 4884d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 489241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 490241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 491241a4b83SYohann } 492241a4b83SYohann } 493241a4b83SYohann } 494241a4b83SYohann } 495241a4b83SYohann 496241a4b83SYohann template <int NCOMP, int Q1d> 497241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 498241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 4994d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 5004d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 501241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 502241a4b83SYohann d_v[ind + nquads * comp] = r_v[z+comp*Q1d]; 503241a4b83SYohann } 504241a4b83SYohann } 505241a4b83SYohann } 506241a4b83SYohann 507241a4b83SYohann template <int NCOMP, int Q1d> 508241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 509241a4b83SYohann for (CeedInt z=0; z < Q1d; ++z) { 5104d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 5114d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 512241a4b83SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 513241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d]; 514241a4b83SYohann } 515241a4b83SYohann } 516241a4b83SYohann } 517241a4b83SYohann 518241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 519241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 520241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 521ac421f39SYohann CeedScalar r_B[P1d]; 522ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 523ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 524ac421f39SYohann } 525ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 526241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 527241a4b83SYohann __syncthreads(); 528241a4b83SYohann V[k] = 0.0; 529ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 530ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 531241a4b83SYohann } 532241a4b83SYohann __syncthreads(); 533241a4b83SYohann } 534241a4b83SYohann } 535241a4b83SYohann 536241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 537241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 538241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 539ac421f39SYohann CeedScalar r_B[P1d]; 540ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 541ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 542ac421f39SYohann } 543ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 544241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 545241a4b83SYohann __syncthreads(); 546241a4b83SYohann V[k] = 0.0; 547ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 548ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 549241a4b83SYohann } 550241a4b83SYohann __syncthreads(); 551241a4b83SYohann } 552241a4b83SYohann } 553241a4b83SYohann 554241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 555241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 556241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 557ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 558241a4b83SYohann V[k] = 0.0; 559ac421f39SYohann for (CeedInt i = 0; i < P1d; ++i) { 560241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 561241a4b83SYohann } 562241a4b83SYohann } 563241a4b83SYohann } 564241a4b83SYohann 565241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 566241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 567241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 568ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 569241a4b83SYohann V[k] = 0.0; 570241a4b83SYohann if (k<P1d) { 571ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 572241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 573241a4b83SYohann } 574241a4b83SYohann } 575241a4b83SYohann } 576241a4b83SYohann } 577241a4b83SYohann 578241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 579241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 580241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 581ac421f39SYohann CeedScalar r_B[Q1d]; 582ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 583ac421f39SYohann { 584ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 585ac421f39SYohann } 586ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 587241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 588241a4b83SYohann __syncthreads(); 589241a4b83SYohann V[k] = 0.0; 590241a4b83SYohann if (data.tidy<P1d) { 591ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 592ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 593ac421f39SYohann } 594ac421f39SYohann } 595ac421f39SYohann __syncthreads(); 596ac421f39SYohann } 597ac421f39SYohann } 598ac421f39SYohann 599ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 600ac421f39SYohann inline __device__ void ContractTransposeAddY3d(BackendData& data, 601ac421f39SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 602ac421f39SYohann CeedScalar r_B[Q1d]; 603ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 604ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 605ac421f39SYohann } 606ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 607ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 608ac421f39SYohann __syncthreads(); 609ac421f39SYohann if (data.tidy<P1d) { 610ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 611ac421f39SYohann V[k] += r_B[i] * data.slice[data.tidx + i*Q1d];//contract y direction 612241a4b83SYohann } 613241a4b83SYohann } 614241a4b83SYohann __syncthreads(); 615241a4b83SYohann } 616241a4b83SYohann } 617241a4b83SYohann 618241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 619241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 620241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 621ac421f39SYohann CeedScalar r_B[Q1d]; 622ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 623ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 624ac421f39SYohann } 625ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 626241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 627241a4b83SYohann __syncthreads(); 628241a4b83SYohann V[k] = 0.0; 629241a4b83SYohann if (data.tidx<P1d) { 630ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 631ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 632241a4b83SYohann } 633241a4b83SYohann } 634241a4b83SYohann __syncthreads(); 635241a4b83SYohann } 636241a4b83SYohann } 637241a4b83SYohann 638241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 639241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 640241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 641ac421f39SYohann CeedScalar r_B[Q1d]; 642ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 643ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 644ac421f39SYohann } 645ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 646241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 647241a4b83SYohann __syncthreads(); 648241a4b83SYohann if (data.tidx<P1d) { 649ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 650ac421f39SYohann V[k] += r_B[i] * data.slice[i + data.tidy*Q1d];//contract x direction 651241a4b83SYohann } 652241a4b83SYohann } 653241a4b83SYohann __syncthreads(); 654241a4b83SYohann } 655241a4b83SYohann } 656241a4b83SYohann 657241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 658241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 659241a4b83SYohann CeedScalar *__restrict__ r_V) { 660241a4b83SYohann CeedScalar r_t1[Q1d]; 661241a4b83SYohann CeedScalar r_t2[Q1d]; 662ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 663241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 664241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 665241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 666241a4b83SYohann } 667241a4b83SYohann } 668241a4b83SYohann 669241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 670241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 671241a4b83SYohann CeedScalar *__restrict__ r_V) { 672241a4b83SYohann CeedScalar r_t1[Q1d]; 673241a4b83SYohann CeedScalar r_t2[Q1d]; 674ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 675241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 676241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 677241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 678241a4b83SYohann } 679241a4b83SYohann } 680241a4b83SYohann 681241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 682241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 683241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 684241a4b83SYohann CeedScalar *__restrict__ r_V) { 685241a4b83SYohann CeedScalar r_t1[Q1d]; 686241a4b83SYohann CeedScalar r_t2[Q1d]; 687ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 688241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 689241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 690241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 691241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 692241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 693241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 694241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 695241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 696241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 697241a4b83SYohann } 698241a4b83SYohann } 699241a4b83SYohann 700241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 701241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 702241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 703241a4b83SYohann CeedScalar *__restrict__ r_V) { 704241a4b83SYohann CeedScalar r_t1[Q1d]; 705241a4b83SYohann CeedScalar r_t2[Q1d]; 706ac421f39SYohann for (CeedInt comp=0; comp<NCOMP; comp++) { 707241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 708241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 709241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 710241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 711241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 712241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 713241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 714241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 715241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 716241a4b83SYohann } 717241a4b83SYohann } 718241a4b83SYohann 719ac421f39SYohann template <int NCOMP, int Q1d> 720ac421f39SYohann inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, 721ac421f39SYohann const CeedScalar *__restrict__ r_U, 722ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 723ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 724ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[q + comp*Q1d]; 725ac421f39SYohann __syncthreads(); 726ac421f39SYohann // X derivative 727ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 728ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 729ac421f39SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 730ac421f39SYohann } 731ac421f39SYohann // Y derivative 732ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 733ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 734ac421f39SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 735ac421f39SYohann } 736ac421f39SYohann // Z derivative 737ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 738ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 739ac421f39SYohann r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d];//contract z direction (Z derivative) 740ac421f39SYohann } 741ac421f39SYohann __syncthreads(); 742ac421f39SYohann } 743ac421f39SYohann } 744ac421f39SYohann 745ac421f39SYohann template <int NCOMP, int Q1d> 746ac421f39SYohann inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, 747ac421f39SYohann const CeedScalar *__restrict__ r_U, 748ac421f39SYohann const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 749ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 750ac421f39SYohann // X derivative 751ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 752ac421f39SYohann __syncthreads(); 753ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 754ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d];//contract x direction (X derivative) 755ac421f39SYohann } 756ac421f39SYohann __syncthreads(); 757ac421f39SYohann // Y derivative 758ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 759ac421f39SYohann __syncthreads(); 760ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 761ac421f39SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d];//contract y direction (Y derivative) 762ac421f39SYohann } 763ac421f39SYohann __syncthreads(); 764ac421f39SYohann // Z derivative 765ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) { 766ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP];// PARTIAL contract z direction (Z derivative) 767ac421f39SYohann } 768ac421f39SYohann } 769ac421f39SYohann } 770ac421f39SYohann 771241a4b83SYohann template <int Q1d> 772241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 773241a4b83SYohann *w = qweight1d[data.tidx]; 774241a4b83SYohann } 775241a4b83SYohann 776241a4b83SYohann template <int Q1d> 777241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 778241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 779241a4b83SYohann } 780241a4b83SYohann 781241a4b83SYohann template <int Q1d> 782241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 783241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 784ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 785241a4b83SYohann { 786241a4b83SYohann w[z] = pw*qweight1d[z]; 787241a4b83SYohann } 788241a4b83SYohann } 789241a4b83SYohann 790241a4b83SYohann ); 791241a4b83SYohann 792241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 793241a4b83SYohann 794241a4b83SYohann using std::ostringstream; 795241a4b83SYohann using std::string; 796241a4b83SYohann int ierr; 797241a4b83SYohann bool setupdone; 798241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 799241a4b83SYohann if (setupdone) return 0; 800241a4b83SYohann Ceed ceed; 801241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 802241a4b83SYohann CeedOperator_Cuda_gen *data; 803241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 804241a4b83SYohann CeedQFunction qf; 805241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 806241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 807241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 8088795c945Sjeremylt CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; 809241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 810241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 811241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 812241a4b83SYohann CeedChk(ierr); 813241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 814241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 815241a4b83SYohann CeedChk(ierr); 816241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 817241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 818241a4b83SYohann CeedChk(ierr); 819241a4b83SYohann CeedEvalMode emode; 820*61dbc9d2Sjeremylt CeedInterlaceMode imode; 821241a4b83SYohann CeedBasis basis; 822241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 823241a4b83SYohann CeedElemRestriction Erestrict; 824241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 825241a4b83SYohann 826241a4b83SYohann ostringstream code; 827241a4b83SYohann string devFunctions(deviceFunctions); 828241a4b83SYohann 829f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 830f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 831f1a13f77SYohann Dudouit Ceed delegate; 832f1a13f77SYohann Dudouit CeedGetDelegate(ceed, &delegate); 833f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 834f1a13f77SYohann Dudouit ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 835f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 836f1a13f77SYohann Dudouit if (prop.major<6){ 837f1a13f77SYohann Dudouit code << atomicAdd; 838f1a13f77SYohann Dudouit } 839f1a13f77SYohann Dudouit 840241a4b83SYohann code << devFunctions; 841241a4b83SYohann 842241a4b83SYohann string qFunction(qf_data->qFunctionSource); 8434d537eeaSYohann 8444d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 845ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 846241a4b83SYohann code << qFunction; 847241a4b83SYohann 848241a4b83SYohann // Setup 849241a4b83SYohann code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 850241a4b83SYohann // Input Evecs and Restriction 851241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 852241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 853241a4b83SYohann CeedChk(ierr); 854241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 855241a4b83SYohann } else { 856241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 857241a4b83SYohann if (emode != CEED_EVAL_NONE) 858241a4b83SYohann { 859241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 860241a4b83SYohann bool isTensor; 861241a4b83SYohann ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 862241a4b83SYohann //TODO check that all are the same 863241a4b83SYohann ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 864241a4b83SYohann if (isTensor) 865241a4b83SYohann { 866241a4b83SYohann //TODO check that all are the same 867241a4b83SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 868241a4b83SYohann } else { 869241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 870241a4b83SYohann } 871241a4b83SYohann } 872241a4b83SYohann } 873241a4b83SYohann } 874241a4b83SYohann data->dim = dim; 875241a4b83SYohann data->Q1d = Q1d; 876241a4b83SYohann 877241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 878241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 879241a4b83SYohann } 880241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 881241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 882241a4b83SYohann // code << "const CeedInt Q = "<<Q<<";\n"; 883241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 884241a4b83SYohann code << "BackendData data;\n"; 885241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 886241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 887241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 888241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 889241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 890ac421f39SYohann //Initialize constants, and matrices B and G 891241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 892241a4b83SYohann code << "// Input field "<<i<<"\n"; 893241a4b83SYohann // Get elemsize, emode, ncomp 894241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 895241a4b83SYohann CeedChk(ierr); 896241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 897241a4b83SYohann CeedChk(ierr); 898241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 899241a4b83SYohann CeedChk(ierr); 9004d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 901241a4b83SYohann CeedChk(ierr); 902241a4b83SYohann // Basis action 903241a4b83SYohann switch (emode) { 904241a4b83SYohann case CEED_EVAL_NONE: 9058795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 906241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9078795c945Sjeremylt code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 908241a4b83SYohann break; 909241a4b83SYohann case CEED_EVAL_INTERP: 910241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 911241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 9128795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 913241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 914241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9158795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 916241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 917241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 918241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 919241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 920241a4b83SYohann break; 921241a4b83SYohann case CEED_EVAL_GRAD: 922241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 9238795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 924241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 9258795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 926241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 927241a4b83SYohann code << "const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 928241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 929241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 930241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 931241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 932ac421f39SYohann if (basis_data->d_collograd1d) { 933ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 934ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 935ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 936ac421f39SYohann } else { 937ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 938ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 939241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 940ac421f39SYohann } 941241a4b83SYohann break; 942241a4b83SYohann case CEED_EVAL_WEIGHT: 943241a4b83SYohann break; // No action 944241a4b83SYohann case CEED_EVAL_DIV: 945241a4b83SYohann break; // TODO: Not implemented 946241a4b83SYohann case CEED_EVAL_CURL: 947241a4b83SYohann break; // TODO: Not implemented 948241a4b83SYohann } 949241a4b83SYohann } 950241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 951241a4b83SYohann code << "// Output field "<<i<<"\n"; 952241a4b83SYohann // Get elemsize, emode, ncomp 953241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 954241a4b83SYohann CeedChk(ierr); 955241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 956241a4b83SYohann CeedChk(ierr); 957241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 958241a4b83SYohann CeedChk(ierr); 9594d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 960241a4b83SYohann CeedChk(ierr); 961241a4b83SYohann // Basis action 962241a4b83SYohann switch (emode) { 963241a4b83SYohann case CEED_EVAL_NONE: 964241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 9658795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 966*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 9678795c945Sjeremylt code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 968241a4b83SYohann break; // No action 969241a4b83SYohann case CEED_EVAL_INTERP: 970241a4b83SYohann code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 971241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 972241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 973241a4b83SYohann code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 974241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 975241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 976241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 977241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 9788795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 9798795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 980241a4b83SYohann break; 981241a4b83SYohann case CEED_EVAL_GRAD: 982241a4b83SYohann code << "const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 983241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 984241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 985241a4b83SYohann code << "const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 986241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 987241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 988241a4b83SYohann code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 989241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 990ac421f39SYohann if (basis_data->d_collograd1d) { 991ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 992ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 993ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 994ac421f39SYohann } else { 995ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 996ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 997241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 998ac421f39SYohann } 9998795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 10008795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 1001241a4b83SYohann break; 1002241a4b83SYohann case CEED_EVAL_WEIGHT: { 1003241a4b83SYohann Ceed ceed; 1004241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1005241a4b83SYohann return CeedError(ceed, 1, 1006241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1007241a4b83SYohann break; // Should not occur 1008241a4b83SYohann } 1009241a4b83SYohann case CEED_EVAL_DIV: 1010241a4b83SYohann break; // TODO: Not implemented 1011241a4b83SYohann case CEED_EVAL_CURL: 1012241a4b83SYohann break; // TODO: Not implemented 1013241a4b83SYohann } 1014241a4b83SYohann } 1015ac421f39SYohann code << "\n"; 1016ac421f39SYohann code << "__syncthreads();\n"; 1017241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1018241a4b83SYohann // Input basis apply if needed 1019ac421f39SYohann // Generate the correct eval mode code for each input 1020241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1021241a4b83SYohann code << " // Input field "<<i<<"\n"; 1022241a4b83SYohann // Get elemsize, emode, ncomp 1023241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1024241a4b83SYohann CeedChk(ierr); 1025241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1026241a4b83SYohann CeedChk(ierr); 1027241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1028241a4b83SYohann CeedChk(ierr); 10294d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1030241a4b83SYohann CeedChk(ierr); 1031241a4b83SYohann // Basis action 1032241a4b83SYohann switch (emode) { 1033241a4b83SYohann case CEED_EVAL_NONE: 1034ac421f39SYohann if (!basis_data->d_collograd1d) { 1035241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1036*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1037*61dbc9d2Sjeremylt code << " readQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 1038ac421f39SYohann } 1039241a4b83SYohann break; 1040241a4b83SYohann case CEED_EVAL_INTERP: 1041241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1042*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1043241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1044241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 1045*61dbc9d2Sjeremylt code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1046241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1047241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1048241a4b83SYohann break; 1049241a4b83SYohann case CEED_EVAL_GRAD: 1050241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 1051*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1052241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1053241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 1054*61dbc9d2Sjeremylt code << " readDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1055ac421f39SYohann if (basis_data->d_collograd1d) { 1056ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1057ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1058ac421f39SYohann } else { 1059241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1060241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1061ac421f39SYohann } 1062241a4b83SYohann break; 1063241a4b83SYohann case CEED_EVAL_WEIGHT: 1064241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1065241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1066241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1067241a4b83SYohann data->W = basis_data->d_qweight1d; 1068241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1069241a4b83SYohann break; // No action 1070241a4b83SYohann case CEED_EVAL_DIV: 1071241a4b83SYohann break; // TODO: Not implemented 1072241a4b83SYohann case CEED_EVAL_CURL: 1073241a4b83SYohann break; // TODO: Not implemented 1074241a4b83SYohann } 1075241a4b83SYohann } 1076ac421f39SYohann 1077241a4b83SYohann // Q function 1078241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1079ac421f39SYohann code << " // Output field "<<i<<"\n"; 1080241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1081241a4b83SYohann CeedChk(ierr); 1082241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1083241a4b83SYohann { 1084ac421f39SYohann if (basis_data->d_collograd1d) { 1085ac421f39SYohann //Accumulator for gradient slices 1086ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1087ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1088ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1089ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1090ac421f39SYohann code << " }\n"; 1091ac421f39SYohann code << " }\n"; 1092ac421f39SYohann } else { 1093241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1094241a4b83SYohann } 1095ac421f39SYohann } 1096241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1097241a4b83SYohann { 1098241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1099241a4b83SYohann } 1100241a4b83SYohann } 1101ac421f39SYohann //We treat quadrature points per slice in 3d to save registers 1102ac421f39SYohann code << "// QFunction\n"; 1103ac421f39SYohann if (basis_data->d_collograd1d) { 1104ac421f39SYohann code << "#pragma unroll\n"; 1105ac421f39SYohann code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1106ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1107ac421f39SYohann code << " // Input field "<<i<<"\n"; 1108ac421f39SYohann // Get elemsize, emode, ncomp 1109ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1110ac421f39SYohann CeedChk(ierr); 1111ac421f39SYohann // Basis action 1112ac421f39SYohann switch (emode) { 1113ac421f39SYohann case CEED_EVAL_NONE: 1114ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1115*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1116*61dbc9d2Sjeremylt code << " readSliceQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<"3d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1117ac421f39SYohann break; 1118ac421f39SYohann case CEED_EVAL_INTERP: 1119ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1120ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1121ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1122ac421f39SYohann code << " }\n"; 1123ac421f39SYohann break; 1124ac421f39SYohann case CEED_EVAL_GRAD: 1125ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1126ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1127ac421f39SYohann break; 1128ac421f39SYohann case CEED_EVAL_WEIGHT: 1129ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1130ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1131ac421f39SYohann break; // No action 1132ac421f39SYohann case CEED_EVAL_DIV: 1133ac421f39SYohann break; // TODO: Not implemented 1134ac421f39SYohann case CEED_EVAL_CURL: 1135ac421f39SYohann break; // TODO: Not implemented 1136ac421f39SYohann } 1137ac421f39SYohann } 1138ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1139ac421f39SYohann code << " // Output field "<<i<<"\n"; 1140ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1141ac421f39SYohann CeedChk(ierr); 1142ac421f39SYohann // Basis action 1143ac421f39SYohann switch (emode) { 1144ac421f39SYohann case CEED_EVAL_NONE: 1145ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1146ac421f39SYohann break; // No action 1147ac421f39SYohann case CEED_EVAL_INTERP: 1148ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1149ac421f39SYohann break; 1150ac421f39SYohann case CEED_EVAL_GRAD: 1151ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1152ac421f39SYohann break; 1153ac421f39SYohann case CEED_EVAL_WEIGHT: 1154ac421f39SYohann break; // Should not occur 1155ac421f39SYohann case CEED_EVAL_DIV: 1156ac421f39SYohann break; // TODO: Not implemented 1157ac421f39SYohann case CEED_EVAL_CURL: 1158ac421f39SYohann break; // TODO: Not implemented 1159ac421f39SYohann } 1160ac421f39SYohann } 1161ac421f39SYohann } else { 1162ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1163ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1164ac421f39SYohann } 1165ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1166ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1167ac421f39SYohann } 1168ac421f39SYohann } 11694d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 11704d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1171ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 11724d537eeaSYohann } 11734d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 11744d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1175ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 11764d537eeaSYohann } 1177241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 1178ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1179ac421f39SYohann if (dim != 3 || basis_data->d_collograd1d) { 1180ac421f39SYohann code << "1 "; 1181ac421f39SYohann }else{ 1182ac421f39SYohann code << "Q1d "; 1183ac421f39SYohann } 1184ac421f39SYohann code << ", in, out);\n"; 1185ac421f39SYohann if (basis_data->d_collograd1d) { 1186ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1187ac421f39SYohann code << " // Output field "<<i<<"\n"; 1188ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1189ac421f39SYohann CeedChk(ierr); 1190ac421f39SYohann // Basis action 1191ac421f39SYohann switch (emode) { 1192ac421f39SYohann case CEED_EVAL_NONE: 1193ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1194ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1195ac421f39SYohann code << " }\n"; 1196ac421f39SYohann break; // No action 1197ac421f39SYohann case CEED_EVAL_INTERP: 1198ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1199ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1200ac421f39SYohann code << " }\n"; 1201ac421f39SYohann break; 1202ac421f39SYohann case CEED_EVAL_GRAD: 1203ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1204ac421f39SYohann break; 1205ac421f39SYohann case CEED_EVAL_WEIGHT: 1206ac421f39SYohann break; // Should not occur 1207ac421f39SYohann case CEED_EVAL_DIV: 1208ac421f39SYohann break; // TODO: Not implemented 1209ac421f39SYohann case CEED_EVAL_CURL: 1210ac421f39SYohann break; // TODO: Not implemented 1211ac421f39SYohann } 1212ac421f39SYohann } 1213ac421f39SYohann code << "}\n"; 1214ac421f39SYohann } 1215241a4b83SYohann 1216241a4b83SYohann // Output basis apply if needed 1217ac421f39SYohann // Generate the correct eval mode code for each output 1218241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1219241a4b83SYohann code << " // Output field "<<i<<"\n"; 1220241a4b83SYohann // Get elemsize, emode, ncomp 1221241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1222241a4b83SYohann CeedChk(ierr); 1223241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1224241a4b83SYohann CeedChk(ierr); 1225241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1226241a4b83SYohann CeedChk(ierr); 12274d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1228241a4b83SYohann CeedChk(ierr); 1229241a4b83SYohann // Basis action 1230241a4b83SYohann switch (emode) { 1231241a4b83SYohann case CEED_EVAL_NONE: 1232*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1233*61dbc9d2Sjeremylt code << " writeQuads"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 1234241a4b83SYohann break; // No action 1235241a4b83SYohann case CEED_EVAL_INTERP: 1236241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1237241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1238*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1239241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1240241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 1241*61dbc9d2Sjeremylt code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1242241a4b83SYohann break; 1243241a4b83SYohann case CEED_EVAL_GRAD: 1244241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1245ac421f39SYohann if (basis_data->d_collograd1d) { 1246ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1247ac421f39SYohann } else { 1248241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1249ac421f39SYohann } 1250*61dbc9d2Sjeremylt ierr = CeedElemRestrictionGetIMode(Erestrict, &imode); CeedChk(ierr); 1251241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1252241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 1253*61dbc9d2Sjeremylt code << " writeDofs"<<(imode==CEED_NONINTERLACED?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1254241a4b83SYohann break; 1255241a4b83SYohann case CEED_EVAL_WEIGHT: { 1256241a4b83SYohann Ceed ceed; 1257241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1258241a4b83SYohann return CeedError(ceed, 1, 1259241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1260241a4b83SYohann break; // Should not occur 1261241a4b83SYohann } 1262241a4b83SYohann case CEED_EVAL_DIV: 1263241a4b83SYohann break; // TODO: Not implemented 1264241a4b83SYohann case CEED_EVAL_CURL: 1265241a4b83SYohann break; // TODO: Not implemented 1266241a4b83SYohann } 1267241a4b83SYohann } 1268241a4b83SYohann 1269241a4b83SYohann code << " }\n"; 1270241a4b83SYohann code << "}\n\n"; 1271241a4b83SYohann 1272241a4b83SYohann // std::cout << code.str(); 1273241a4b83SYohann 1274241a4b83SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1275241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1276241a4b83SYohann CeedChk(ierr); 1277241a4b83SYohann 1278241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1279241a4b83SYohann 1280241a4b83SYohann return 0; 1281241a4b83SYohann } 1282