1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39f1a13f77SYohann Dudouit ); 40f1a13f77SYohann Dudouit 41f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45f1a13f77SYohann Dudouit 46f1a13f77SYohann Dudouit typedef struct { 47f1a13f77SYohann Dudouit CeedInt tidx; 48f1a13f77SYohann Dudouit CeedInt tidy; 49f1a13f77SYohann Dudouit CeedInt tidz; 50f1a13f77SYohann Dudouit CeedInt tid; 51f1a13f77SYohann Dudouit CeedScalar* slice; 52f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56241a4b83SYohann for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann } 60241a4b83SYohann 61241a4b83SYohann //**** 62241a4b83SYohann // 1D 63241a4b83SYohann template <int NCOMP, int P1d> 648795c945Sjeremylt inline __device__ void readDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 65241a4b83SYohann if (data.tidx<P1d) 66241a4b83SYohann { 67*4d537eeaSYohann const CeedInt node = data.tidx; 68*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 69241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 708795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 71241a4b83SYohann } 72241a4b83SYohann } 73241a4b83SYohann } 74241a4b83SYohann 75241a4b83SYohann template <int NCOMP, int P1d> 768795c945Sjeremylt inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77241a4b83SYohann if (data.tidx<P1d) 78241a4b83SYohann { 79*4d537eeaSYohann const CeedInt node = data.tidx; 80*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 81241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 82241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 83241a4b83SYohann } 84241a4b83SYohann } 85241a4b83SYohann } 86241a4b83SYohann 87241a4b83SYohann template <int NCOMP, int Q1d> 88241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 89*4d537eeaSYohann const CeedInt node = data.tidx; 90*4d537eeaSYohann const CeedInt ind = node + elem * Q1d; 91241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 92241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 93241a4b83SYohann } 94241a4b83SYohann } 95241a4b83SYohann 96241a4b83SYohann template <int NCOMP, int Q1d> 97241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 98*4d537eeaSYohann const CeedInt node = data.tidx; 99*4d537eeaSYohann const CeedInt ind = node + elem * Q1d; 100241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 101241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 102241a4b83SYohann } 103241a4b83SYohann } 104241a4b83SYohann 105241a4b83SYohann template <int NCOMP, int P1d> 1068795c945Sjeremylt inline __device__ void writeDofs1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 107241a4b83SYohann if (data.tidx<P1d) 108241a4b83SYohann { 109*4d537eeaSYohann const CeedInt node = data.tidx; 110*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 111241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 1128795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 113241a4b83SYohann } 114241a4b83SYohann } 115241a4b83SYohann } 116241a4b83SYohann 117241a4b83SYohann template <int NCOMP, int P1d> 1188795c945Sjeremylt inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 119241a4b83SYohann if (data.tidx<P1d) 120241a4b83SYohann { 121*4d537eeaSYohann const CeedInt node = data.tidx; 122*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d] : node + elem * P1d; 123241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 124241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 125241a4b83SYohann } 126241a4b83SYohann } 127241a4b83SYohann } 128241a4b83SYohann 129241a4b83SYohann template <int NCOMP, int Q1d> 130241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 131*4d537eeaSYohann const CeedInt node = data.tidx; 132*4d537eeaSYohann const CeedInt ind = node + elem * Q1d; 133241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 134241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 135241a4b83SYohann } 136241a4b83SYohann } 137241a4b83SYohann 138241a4b83SYohann template <int NCOMP, int Q1d> 139241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 140*4d537eeaSYohann const CeedInt node = data.tidx; 141*4d537eeaSYohann const CeedInt ind = node + elem * Q1d; 142241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 143241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 144241a4b83SYohann } 145241a4b83SYohann } 146241a4b83SYohann 147241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 148241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 149241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 150241a4b83SYohann data.slice[data.tidx] = *U; 151241a4b83SYohann __syncthreads(); 152241a4b83SYohann *V = 0.0; 153241a4b83SYohann for (int i = 0; i < P1d; ++i) { 154241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 155241a4b83SYohann } 156241a4b83SYohann __syncthreads(); 157241a4b83SYohann } 158241a4b83SYohann 159241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 160241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 161241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 162241a4b83SYohann data.slice[data.tidx] = *U; 163241a4b83SYohann __syncthreads(); 164241a4b83SYohann *V = 0.0; 165241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 166241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 167241a4b83SYohann } 168241a4b83SYohann __syncthreads(); 169241a4b83SYohann } 170241a4b83SYohann 171241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 172241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 173241a4b83SYohann CeedScalar *__restrict__ r_V) { 174241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 175241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 176241a4b83SYohann } 177241a4b83SYohann } 178241a4b83SYohann 179241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 180241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 181241a4b83SYohann CeedScalar *__restrict__ r_V) { 182241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 183241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 184241a4b83SYohann } 185241a4b83SYohann } 186241a4b83SYohann 187241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 188241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 189241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 190241a4b83SYohann CeedScalar *__restrict__ r_V) { 191241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 192241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 193241a4b83SYohann } 194241a4b83SYohann } 195241a4b83SYohann 196241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 197241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 198241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 199241a4b83SYohann CeedScalar *__restrict__ r_V) { 200241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 201241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 202241a4b83SYohann } 203241a4b83SYohann } 204241a4b83SYohann 205241a4b83SYohann //**** 206241a4b83SYohann // 2D 207241a4b83SYohann template <int NCOMP, int P1d> 2088795c945Sjeremylt inline __device__ void readDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 209241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 210241a4b83SYohann { 211*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 212*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 213241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 2148795c945Sjeremylt r_u[comp] = d_u[ind + nnodes * comp]; 215241a4b83SYohann } 216241a4b83SYohann } 217241a4b83SYohann } 218241a4b83SYohann 219241a4b83SYohann template <int NCOMP, int P1d> 2208795c945Sjeremylt inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 221241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 222241a4b83SYohann { 223*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 224*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 225241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 226241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 227241a4b83SYohann } 228241a4b83SYohann } 229241a4b83SYohann } 230241a4b83SYohann 231241a4b83SYohann template <int NCOMP, int Q1d> 232241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 233*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 234*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 235241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 236241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 237241a4b83SYohann } 238241a4b83SYohann } 239241a4b83SYohann 240241a4b83SYohann template <int NCOMP, int Q1d> 241241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 242*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 243*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 244241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 245241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 246241a4b83SYohann } 247241a4b83SYohann } 248241a4b83SYohann 249241a4b83SYohann template <int NCOMP, int P1d> 2508795c945Sjeremylt inline __device__ void writeDofs2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 251241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 252241a4b83SYohann { 253*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 254*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 255241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 2568795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[comp]); 257241a4b83SYohann } 258241a4b83SYohann } 259241a4b83SYohann } 260241a4b83SYohann 261241a4b83SYohann template <int NCOMP, int P1d> 2628795c945Sjeremylt inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 263241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 264241a4b83SYohann { 265*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 266*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d] : node + elem * P1d*P1d; 267241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 268241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 269241a4b83SYohann } 270241a4b83SYohann } 271241a4b83SYohann } 272241a4b83SYohann 273241a4b83SYohann template <int NCOMP, int Q1d> 274241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 275*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 276*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 277241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 278241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 279241a4b83SYohann } 280241a4b83SYohann } 281241a4b83SYohann 282241a4b83SYohann template <int NCOMP, int Q1d> 283241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 284*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d; 285*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d; 286241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 287241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 288241a4b83SYohann } 289241a4b83SYohann } 290241a4b83SYohann 291241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 292241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 293241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 294241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 295241a4b83SYohann __syncthreads(); 296241a4b83SYohann *V = 0.0; 297241a4b83SYohann for (int i = 0; i < P1d; ++i) { 298241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 299241a4b83SYohann } 300241a4b83SYohann __syncthreads(); 301241a4b83SYohann } 302241a4b83SYohann 303241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 304241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 305241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 306241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 307241a4b83SYohann __syncthreads(); 308241a4b83SYohann *V = 0.0; 309241a4b83SYohann for (int i = 0; i < P1d; ++i) { 310241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 311241a4b83SYohann } 312241a4b83SYohann __syncthreads(); 313241a4b83SYohann } 314241a4b83SYohann 315241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 316241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 317241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 318241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 319241a4b83SYohann __syncthreads(); 320241a4b83SYohann *V = 0.0; 321241a4b83SYohann if (data.tidy<P1d) { 322241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 323241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 324241a4b83SYohann } 325241a4b83SYohann } 326241a4b83SYohann __syncthreads(); 327241a4b83SYohann } 328241a4b83SYohann 329241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 330241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 331241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 332241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 333241a4b83SYohann __syncthreads(); 334241a4b83SYohann *V = 0.0; 335241a4b83SYohann if (data.tidx<P1d) { 336241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 337241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 338241a4b83SYohann } 339241a4b83SYohann } 340241a4b83SYohann __syncthreads(); 341241a4b83SYohann } 342241a4b83SYohann 343241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 344241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 345241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 346241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 347241a4b83SYohann __syncthreads(); 348241a4b83SYohann if (data.tidx<P1d) { 349241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 350241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 351241a4b83SYohann } 352241a4b83SYohann } 353241a4b83SYohann __syncthreads(); 354241a4b83SYohann } 355241a4b83SYohann 356241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 357241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 358241a4b83SYohann CeedScalar *__restrict__ r_V) { 359241a4b83SYohann CeedScalar r_t[1]; 360241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 361241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 362241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 363241a4b83SYohann } 364241a4b83SYohann } 365241a4b83SYohann 366241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 367241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 368241a4b83SYohann CeedScalar *__restrict__ r_V) { 369241a4b83SYohann CeedScalar r_t[1]; 370241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 371241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 372241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 373241a4b83SYohann } 374241a4b83SYohann } 375241a4b83SYohann 376241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 377241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 378241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 379241a4b83SYohann CeedScalar *__restrict__ r_V) { 380241a4b83SYohann CeedScalar r_t[1]; 381241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 382241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 383241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 384241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 385241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 386241a4b83SYohann } 387241a4b83SYohann } 388241a4b83SYohann 389241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 390241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 391241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 392241a4b83SYohann CeedScalar *__restrict__ r_V) { 393241a4b83SYohann CeedScalar r_t[1]; 394241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 395241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 396241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 397241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 398241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 399241a4b83SYohann } 400241a4b83SYohann } 401241a4b83SYohann 402241a4b83SYohann //**** 403241a4b83SYohann // 3D 404241a4b83SYohann template <int NCOMP, int P1d> 4058795c945Sjeremylt inline __device__ void readDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 406241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 407241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 408*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 409*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 410241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 4118795c945Sjeremylt r_u[z+comp*P1d] = d_u[ind + nnodes * comp]; 412241a4b83SYohann } 413241a4b83SYohann } 414241a4b83SYohann } 415241a4b83SYohann } 416241a4b83SYohann 417241a4b83SYohann template <int NCOMP, int P1d> 4188795c945Sjeremylt inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 419241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 420241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 421*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 422*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 423241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 424241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 425241a4b83SYohann } 426241a4b83SYohann } 427241a4b83SYohann } 428241a4b83SYohann } 429241a4b83SYohann 430241a4b83SYohann template <int NCOMP, int Q1d> 431241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 432241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 433*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 434*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 435241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 436241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind + nquads * comp]; 437241a4b83SYohann } 438241a4b83SYohann } 439241a4b83SYohann } 440241a4b83SYohann 441241a4b83SYohann template <int NCOMP, int Q1d> 442241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 443241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 444*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 445*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 446241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 447241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp]; 448241a4b83SYohann } 449241a4b83SYohann } 450241a4b83SYohann } 451241a4b83SYohann 452241a4b83SYohann template <int NCOMP, int P1d> 4538795c945Sjeremylt inline __device__ void writeDofs3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 454241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 455241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 456*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 457*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 458241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 4598795c945Sjeremylt atomicAdd(&d_v[ind + nnodes * comp], r_v[z+comp*P1d]); 460241a4b83SYohann } 461241a4b83SYohann } 462241a4b83SYohann } 463241a4b83SYohann } 464241a4b83SYohann 465241a4b83SYohann template <int NCOMP, int P1d> 4668795c945Sjeremylt inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 467241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 468241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 469*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 470*4d537eeaSYohann const CeedInt ind = indices ? indices[node + elem * P1d*P1d*P1d] : node + elem * P1d*P1d*P1d; 471241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 472241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 473241a4b83SYohann } 474241a4b83SYohann } 475241a4b83SYohann } 476241a4b83SYohann } 477241a4b83SYohann 478241a4b83SYohann template <int NCOMP, int Q1d> 479241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 480241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 481*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 482*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 483241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 484241a4b83SYohann d_v[ind + nquads * comp] = r_v[z+comp*Q1d]; 485241a4b83SYohann } 486241a4b83SYohann } 487241a4b83SYohann } 488241a4b83SYohann 489241a4b83SYohann template <int NCOMP, int Q1d> 490241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 491241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 492*4d537eeaSYohann const CeedInt node = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 493*4d537eeaSYohann const CeedInt ind = node + elem * Q1d*Q1d*Q1d; 494241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 495241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d]; 496241a4b83SYohann } 497241a4b83SYohann } 498241a4b83SYohann } 499241a4b83SYohann 500241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 501241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 502241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 503241a4b83SYohann for (int k = 0; k < P1d; ++k) { 504241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 505241a4b83SYohann __syncthreads(); 506241a4b83SYohann V[k] = 0.0; 507241a4b83SYohann for (int i = 0; i < P1d; ++i) { 508241a4b83SYohann V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 509241a4b83SYohann } 510241a4b83SYohann __syncthreads(); 511241a4b83SYohann } 512241a4b83SYohann } 513241a4b83SYohann 514241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 515241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 516241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 517241a4b83SYohann for (int k = 0; k < P1d; ++k) { 518241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 519241a4b83SYohann __syncthreads(); 520241a4b83SYohann V[k] = 0.0; 521241a4b83SYohann for (int i = 0; i < P1d; ++i) { 522241a4b83SYohann V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 523241a4b83SYohann } 524241a4b83SYohann __syncthreads(); 525241a4b83SYohann } 526241a4b83SYohann } 527241a4b83SYohann 528241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 529241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 530241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 531241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 532241a4b83SYohann V[k] = 0.0; 533241a4b83SYohann for (int i = 0; i < P1d; ++i) { 534241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 535241a4b83SYohann } 536241a4b83SYohann } 537241a4b83SYohann } 538241a4b83SYohann 539241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 540241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 541241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 542241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 543241a4b83SYohann V[k] = 0.0; 544241a4b83SYohann if (k<P1d) { 545241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 546241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 547241a4b83SYohann } 548241a4b83SYohann } 549241a4b83SYohann } 550241a4b83SYohann } 551241a4b83SYohann 552241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 553241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 554241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 555241a4b83SYohann for (int k = 0; k < P1d; ++k) { 556241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 557241a4b83SYohann __syncthreads(); 558241a4b83SYohann V[k] = 0.0; 559241a4b83SYohann if (data.tidy<P1d) { 560241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 561241a4b83SYohann V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 562241a4b83SYohann } 563241a4b83SYohann } 564241a4b83SYohann __syncthreads(); 565241a4b83SYohann } 566241a4b83SYohann } 567241a4b83SYohann 568241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 569241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 570241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 571241a4b83SYohann for (int k = 0; k < P1d; ++k) { 572241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 573241a4b83SYohann __syncthreads(); 574241a4b83SYohann V[k] = 0.0; 575241a4b83SYohann if (data.tidx<P1d) { 576241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 577241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 578241a4b83SYohann } 579241a4b83SYohann } 580241a4b83SYohann __syncthreads(); 581241a4b83SYohann } 582241a4b83SYohann } 583241a4b83SYohann 584241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 585241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 586241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 587241a4b83SYohann for (int k = 0; k < P1d; ++k) { 588241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 589241a4b83SYohann __syncthreads(); 590241a4b83SYohann if (data.tidx<P1d) { 591241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 592241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 593241a4b83SYohann } 594241a4b83SYohann } 595241a4b83SYohann __syncthreads(); 596241a4b83SYohann } 597241a4b83SYohann } 598241a4b83SYohann 599241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 600241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 601241a4b83SYohann CeedScalar *__restrict__ r_V) { 602241a4b83SYohann CeedScalar r_t1[Q1d]; 603241a4b83SYohann CeedScalar r_t2[Q1d]; 604241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 605241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 606241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 607241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 608241a4b83SYohann } 609241a4b83SYohann } 610241a4b83SYohann 611241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 612241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 613241a4b83SYohann CeedScalar *__restrict__ r_V) { 614241a4b83SYohann CeedScalar r_t1[Q1d]; 615241a4b83SYohann CeedScalar r_t2[Q1d]; 616241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 617241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 618241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 619241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 620241a4b83SYohann } 621241a4b83SYohann } 622241a4b83SYohann 623241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 624241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 625241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 626241a4b83SYohann CeedScalar *__restrict__ r_V) { 627241a4b83SYohann CeedScalar r_t1[Q1d]; 628241a4b83SYohann CeedScalar r_t2[Q1d]; 629241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 630241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 631241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 632241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 633241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 634241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 635241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 636241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 637241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 638241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 639241a4b83SYohann } 640241a4b83SYohann } 641241a4b83SYohann 642241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 643241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 644241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 645241a4b83SYohann CeedScalar *__restrict__ r_V) { 646241a4b83SYohann CeedScalar r_t1[Q1d]; 647241a4b83SYohann CeedScalar r_t2[Q1d]; 648241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 649241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 650241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 651241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 652241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 653241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 654241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 655241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 656241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 657241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 658241a4b83SYohann } 659241a4b83SYohann } 660241a4b83SYohann 661241a4b83SYohann template <int Q1d> 662241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 663241a4b83SYohann *w = qweight1d[data.tidx]; 664241a4b83SYohann } 665241a4b83SYohann 666241a4b83SYohann template <int Q1d> 667241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 668241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 669241a4b83SYohann } 670241a4b83SYohann 671241a4b83SYohann template <int Q1d> 672241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 673241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 674241a4b83SYohann for (int z = 0; z < Q1d; ++z) 675241a4b83SYohann { 676241a4b83SYohann w[z] = pw*qweight1d[z]; 677241a4b83SYohann } 678241a4b83SYohann } 679241a4b83SYohann 680241a4b83SYohann ); 681241a4b83SYohann 682241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 683241a4b83SYohann 684241a4b83SYohann using std::ostringstream; 685241a4b83SYohann using std::string; 686241a4b83SYohann int ierr; 687241a4b83SYohann bool setupdone; 688241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 689241a4b83SYohann if (setupdone) return 0; 690241a4b83SYohann Ceed ceed; 691241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 692241a4b83SYohann CeedOperator_Cuda_gen *data; 693241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 694241a4b83SYohann CeedQFunction qf; 695241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 696241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 697241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 6988795c945Sjeremylt CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, nnodes; 699241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 700241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 701241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 702241a4b83SYohann CeedChk(ierr); 703241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 704241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 705241a4b83SYohann CeedChk(ierr); 706241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 707241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 708241a4b83SYohann CeedChk(ierr); 709241a4b83SYohann CeedEvalMode emode; 710241a4b83SYohann CeedTransposeMode lmode; 711241a4b83SYohann CeedBasis basis; 712241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 713241a4b83SYohann CeedElemRestriction Erestrict; 714241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 715241a4b83SYohann 716241a4b83SYohann ostringstream code; 717241a4b83SYohann string devFunctions(deviceFunctions); 718241a4b83SYohann 719f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 720f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 721f1a13f77SYohann Dudouit Ceed delegate; 722f1a13f77SYohann Dudouit CeedGetDelegate(ceed, &delegate); 723f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 724f1a13f77SYohann Dudouit ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 725f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 726f1a13f77SYohann Dudouit if(prop.major<6){ 727f1a13f77SYohann Dudouit code << atomicAdd; 728f1a13f77SYohann Dudouit } 729f1a13f77SYohann Dudouit 730241a4b83SYohann code << devFunctions; 731241a4b83SYohann 732241a4b83SYohann string qFunction(qf_data->qFunctionSource); 733*4d537eeaSYohann 734*4d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 735241a4b83SYohann code << qFunction; 736241a4b83SYohann 737241a4b83SYohann // Setup 738241a4b83SYohann code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 739241a4b83SYohann // Input Evecs and Restriction 740241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 741241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 742241a4b83SYohann CeedChk(ierr); 743241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 744241a4b83SYohann } else { 745241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 746241a4b83SYohann if (emode != CEED_EVAL_NONE) 747241a4b83SYohann { 748241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 749241a4b83SYohann bool isTensor; 750241a4b83SYohann ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 751241a4b83SYohann //TODO check that all are the same 752241a4b83SYohann ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 753241a4b83SYohann if (isTensor) 754241a4b83SYohann { 755241a4b83SYohann //TODO check that all are the same 756241a4b83SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 757241a4b83SYohann } else { 758241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 759241a4b83SYohann } 760241a4b83SYohann } 761241a4b83SYohann } 762241a4b83SYohann } 763241a4b83SYohann data->dim = dim; 764241a4b83SYohann data->Q1d = Q1d; 765241a4b83SYohann 766241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 767241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 768241a4b83SYohann } 769241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 770241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 771241a4b83SYohann // code << "const CeedInt Q = "<<Q<<";\n"; 772241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 773241a4b83SYohann code << "BackendData data;\n"; 774241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 775241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 776241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 777241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 778241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 779241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 780241a4b83SYohann code << "// Input field "<<i<<"\n"; 781241a4b83SYohann // Get elemsize, emode, ncomp 782241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 783241a4b83SYohann CeedChk(ierr); 784241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 785241a4b83SYohann CeedChk(ierr); 786241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 787241a4b83SYohann CeedChk(ierr); 788*4d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 789241a4b83SYohann CeedChk(ierr); 790241a4b83SYohann // Basis action 791241a4b83SYohann switch (emode) { 792241a4b83SYohann case CEED_EVAL_NONE: 7938795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 794241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 7958795c945Sjeremylt code << " const CeedInt nquads_in_"<<i<<" = "<<nnodes<<";\n"; 796241a4b83SYohann break; 797241a4b83SYohann case CEED_EVAL_INTERP: 798241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 799241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 8008795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 801241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 802241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 8038795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 804241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 805241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 806241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 807241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 808241a4b83SYohann break; 809241a4b83SYohann case CEED_EVAL_GRAD: 810241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 8118795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 812241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 8138795c945Sjeremylt code << " const CeedInt nnodes_in_"<<i<<" = "<<nnodes<<";\n"; 814241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 815241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 816241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 817241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 818241a4b83SYohann data->G.in[i] = basis_data->d_grad1d; 819241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 820241a4b83SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 821241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 822241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 823241a4b83SYohann break; 824241a4b83SYohann case CEED_EVAL_WEIGHT: 825241a4b83SYohann break; // No action 826241a4b83SYohann case CEED_EVAL_DIV: 827241a4b83SYohann break; // TODO: Not implemented 828241a4b83SYohann case CEED_EVAL_CURL: 829241a4b83SYohann break; // TODO: Not implemented 830241a4b83SYohann } 831241a4b83SYohann } 832241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 833241a4b83SYohann code << "// Output field "<<i<<"\n"; 834241a4b83SYohann // Get elemsize, emode, ncomp 835241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 836241a4b83SYohann CeedChk(ierr); 837241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 838241a4b83SYohann CeedChk(ierr); 839241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 840241a4b83SYohann CeedChk(ierr); 841*4d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 842241a4b83SYohann CeedChk(ierr); 843241a4b83SYohann // Basis action 844241a4b83SYohann switch (emode) { 845241a4b83SYohann case CEED_EVAL_NONE: 846241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 8478795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 848241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 8498795c945Sjeremylt code << " const CeedInt nquads_out_"<<i<<" = "<<nnodes<<";\n"; 850241a4b83SYohann break; // No action 851241a4b83SYohann case CEED_EVAL_INTERP: 852241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 853241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 854241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 855241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 856241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 857241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 858241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 859241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 8608795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 8618795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 862241a4b83SYohann break; 863241a4b83SYohann case CEED_EVAL_GRAD: 864241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 865241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 866241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 867241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 868241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 869241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 870241a4b83SYohann data->G.out[i] = basis_data->d_grad1d; 871241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 872241a4b83SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 873241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 874241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 8758795c945Sjeremylt ierr = CeedElemRestrictionGetNumNodes(Erestrict, &nnodes); CeedChk(ierr); 8768795c945Sjeremylt code << " const CeedInt nnodes_out_"<<i<<" = "<<nnodes<<";\n"; 877241a4b83SYohann break; 878241a4b83SYohann case CEED_EVAL_WEIGHT: { 879241a4b83SYohann Ceed ceed; 880241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 881241a4b83SYohann return CeedError(ceed, 1, 882241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 883241a4b83SYohann break; // Should not occur 884241a4b83SYohann } 885241a4b83SYohann case CEED_EVAL_DIV: 886241a4b83SYohann break; // TODO: Not implemented 887241a4b83SYohann case CEED_EVAL_CURL: 888241a4b83SYohann break; // TODO: Not implemented 889241a4b83SYohann } 890241a4b83SYohann } 891241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 892241a4b83SYohann // Input basis apply if needed 893241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 894241a4b83SYohann code << "// Input field "<<i<<"\n"; 895241a4b83SYohann // Get elemsize, emode, ncomp 896241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 897241a4b83SYohann CeedChk(ierr); 898241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 899241a4b83SYohann CeedChk(ierr); 900241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 901241a4b83SYohann CeedChk(ierr); 902*4d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 903241a4b83SYohann CeedChk(ierr); 904241a4b83SYohann // Basis action 905241a4b83SYohann switch (emode) { 906241a4b83SYohann case CEED_EVAL_NONE: 907241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 908241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 909241a4b83SYohann code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 910241a4b83SYohann break; 911241a4b83SYohann case CEED_EVAL_INTERP: 912241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 913241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 914241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 915241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 9168795c945Sjeremylt code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 917241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 918241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 919241a4b83SYohann break; 920241a4b83SYohann case CEED_EVAL_GRAD: 921241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 922241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 923241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 924241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 9258795c945Sjeremylt code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, nnodes_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 926241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 927241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 928241a4b83SYohann break; 929241a4b83SYohann case CEED_EVAL_WEIGHT: 930241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 931241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 932241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 933241a4b83SYohann data->W = basis_data->d_qweight1d; 934241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 935241a4b83SYohann break; // No action 936241a4b83SYohann case CEED_EVAL_DIV: 937241a4b83SYohann break; // TODO: Not implemented 938241a4b83SYohann case CEED_EVAL_CURL: 939241a4b83SYohann break; // TODO: Not implemented 940241a4b83SYohann } 941241a4b83SYohann } 942241a4b83SYohann // Q function 943241a4b83SYohann code << "// QFunction\n"; 944241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 945241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 946241a4b83SYohann CeedChk(ierr); 947241a4b83SYohann if (emode==CEED_EVAL_GRAD) 948241a4b83SYohann { 949241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 950241a4b83SYohann } 951241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 952241a4b83SYohann { 953241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 954241a4b83SYohann } 955241a4b83SYohann } 956*4d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 957*4d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 958*4d537eeaSYohann code << " in["<<i<<"] = r_t"<<i<<";\n"; 959*4d537eeaSYohann } 960*4d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 961*4d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 962*4d537eeaSYohann code << " out["<<i<<"] = r_tt"<<i<<";\n"; 963*4d537eeaSYohann } 964241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 965241a4b83SYohann code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", "; 966*4d537eeaSYohann code << "in, out"; 967241a4b83SYohann code << ");\n"; 968241a4b83SYohann 969241a4b83SYohann // Output basis apply if needed 970241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 971241a4b83SYohann code << "// Output field "<<i<<"\n"; 972241a4b83SYohann // Get elemsize, emode, ncomp 973241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 974241a4b83SYohann CeedChk(ierr); 975241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 976241a4b83SYohann CeedChk(ierr); 977241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 978241a4b83SYohann CeedChk(ierr); 979*4d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 980241a4b83SYohann CeedChk(ierr); 981241a4b83SYohann // Basis action 982241a4b83SYohann switch (emode) { 983241a4b83SYohann case CEED_EVAL_NONE: 984241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 985241a4b83SYohann code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 986241a4b83SYohann break; // No action 987241a4b83SYohann case CEED_EVAL_INTERP: 988241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 989241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 990241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 991241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 992241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 9938795c945Sjeremylt code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 994241a4b83SYohann break; 995241a4b83SYohann case CEED_EVAL_GRAD: 996241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 997241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 998241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 999241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1000241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 10018795c945Sjeremylt code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, nnodes_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1002241a4b83SYohann break; 1003241a4b83SYohann case CEED_EVAL_WEIGHT: { 1004241a4b83SYohann Ceed ceed; 1005241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1006241a4b83SYohann return CeedError(ceed, 1, 1007241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1008241a4b83SYohann break; // Should not occur 1009241a4b83SYohann } 1010241a4b83SYohann case CEED_EVAL_DIV: 1011241a4b83SYohann break; // TODO: Not implemented 1012241a4b83SYohann case CEED_EVAL_CURL: 1013241a4b83SYohann break; // TODO: Not implemented 1014241a4b83SYohann } 1015241a4b83SYohann } 1016241a4b83SYohann 1017241a4b83SYohann code << " }\n"; 1018241a4b83SYohann code << "}\n\n"; 1019241a4b83SYohann 1020241a4b83SYohann // std::cout << code.str(); 1021241a4b83SYohann 1022241a4b83SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1023241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1024241a4b83SYohann CeedChk(ierr); 1025241a4b83SYohann 1026241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1027241a4b83SYohann 1028241a4b83SYohann return 0; 1029241a4b83SYohann } 1030