1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16241a4b83SYohann #include <ceed-backend.h> 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda/ceed-cuda.h" 21241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23241a4b83SYohann 24*f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 25241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 26241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 27241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 28241a4b83SYohann do { 29241a4b83SYohann assumed = old; 30241a4b83SYohann old = 31241a4b83SYohann atomicCAS(address_as_ull, assumed, 32241a4b83SYohann __double_as_longlong(val + 33241a4b83SYohann __longlong_as_double(assumed))); 34241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 35241a4b83SYohann // (since NaN != NaN) 36241a4b83SYohann } while (assumed != old); 37241a4b83SYohann return __longlong_as_double(old); 38241a4b83SYohann } 39*f1a13f77SYohann Dudouit ); 40*f1a13f77SYohann Dudouit 41*f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 42*f1a13f77SYohann Dudouit 43*f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 44*f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 45*f1a13f77SYohann Dudouit 46*f1a13f77SYohann Dudouit typedef struct { 47*f1a13f77SYohann Dudouit CeedInt tidx; 48*f1a13f77SYohann Dudouit CeedInt tidy; 49*f1a13f77SYohann Dudouit CeedInt tidz; 50*f1a13f77SYohann Dudouit CeedInt tid; 51*f1a13f77SYohann Dudouit CeedScalar* slice; 52*f1a13f77SYohann Dudouit } BackendData; 53241a4b83SYohann 54241a4b83SYohann template <int P, int Q> 55241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56241a4b83SYohann for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57241a4b83SYohann B[i] = d_B[i]; 58241a4b83SYohann } 59241a4b83SYohann __syncthreads; 60241a4b83SYohann } 61241a4b83SYohann 62241a4b83SYohann //**** 63241a4b83SYohann // 1D 64241a4b83SYohann template <int NCOMP, int P1d> 65241a4b83SYohann inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 66241a4b83SYohann if (data.tidx<P1d) 67241a4b83SYohann { 68241a4b83SYohann const CeedInt dof = data.tidx; 69241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 70241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 71241a4b83SYohann r_u[comp] = d_u[ind + ndofs * comp]; 72241a4b83SYohann } 73241a4b83SYohann } 74241a4b83SYohann } 75241a4b83SYohann 76241a4b83SYohann template <int NCOMP, int P1d> 77241a4b83SYohann inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78241a4b83SYohann if (data.tidx<P1d) 79241a4b83SYohann { 80241a4b83SYohann const CeedInt dof = data.tidx; 81241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 82241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 83241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 84241a4b83SYohann } 85241a4b83SYohann } 86241a4b83SYohann } 87241a4b83SYohann 88241a4b83SYohann template <int NCOMP, int Q1d> 89241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 90241a4b83SYohann const CeedInt dof = data.tidx; 91241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 92241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 93241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 94241a4b83SYohann } 95241a4b83SYohann } 96241a4b83SYohann 97241a4b83SYohann template <int NCOMP, int Q1d> 98241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 99241a4b83SYohann const CeedInt dof = data.tidx; 100241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 101241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 102241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 103241a4b83SYohann } 104241a4b83SYohann } 105241a4b83SYohann 106241a4b83SYohann template <int NCOMP, int P1d> 107241a4b83SYohann inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 108241a4b83SYohann if (data.tidx<P1d) 109241a4b83SYohann { 110241a4b83SYohann const CeedInt dof = data.tidx; 111241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 112241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 113241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]); 114241a4b83SYohann } 115241a4b83SYohann } 116241a4b83SYohann } 117241a4b83SYohann 118241a4b83SYohann template <int NCOMP, int P1d> 119241a4b83SYohann inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 120241a4b83SYohann if (data.tidx<P1d) 121241a4b83SYohann { 122241a4b83SYohann const CeedInt dof = data.tidx; 123241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 124241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 125241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 126241a4b83SYohann } 127241a4b83SYohann } 128241a4b83SYohann } 129241a4b83SYohann 130241a4b83SYohann template <int NCOMP, int Q1d> 131241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 132241a4b83SYohann const CeedInt dof = data.tidx; 133241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 134241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 135241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 136241a4b83SYohann } 137241a4b83SYohann } 138241a4b83SYohann 139241a4b83SYohann template <int NCOMP, int Q1d> 140241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 141241a4b83SYohann const CeedInt dof = data.tidx; 142241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 143241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 144241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 145241a4b83SYohann } 146241a4b83SYohann } 147241a4b83SYohann 148241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 149241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 150241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 151241a4b83SYohann data.slice[data.tidx] = *U; 152241a4b83SYohann __syncthreads(); 153241a4b83SYohann *V = 0.0; 154241a4b83SYohann for (int i = 0; i < P1d; ++i) { 155241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 156241a4b83SYohann } 157241a4b83SYohann __syncthreads(); 158241a4b83SYohann } 159241a4b83SYohann 160241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 161241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 162241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 163241a4b83SYohann data.slice[data.tidx] = *U; 164241a4b83SYohann __syncthreads(); 165241a4b83SYohann *V = 0.0; 166241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 167241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 168241a4b83SYohann } 169241a4b83SYohann __syncthreads(); 170241a4b83SYohann } 171241a4b83SYohann 172241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 173241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 174241a4b83SYohann CeedScalar *__restrict__ r_V) { 175241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 176241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 177241a4b83SYohann } 178241a4b83SYohann } 179241a4b83SYohann 180241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 181241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 182241a4b83SYohann CeedScalar *__restrict__ r_V) { 183241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 184241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 185241a4b83SYohann } 186241a4b83SYohann } 187241a4b83SYohann 188241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 189241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 190241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 191241a4b83SYohann CeedScalar *__restrict__ r_V) { 192241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 193241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 194241a4b83SYohann } 195241a4b83SYohann } 196241a4b83SYohann 197241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 198241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 199241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 200241a4b83SYohann CeedScalar *__restrict__ r_V) { 201241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 202241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 203241a4b83SYohann } 204241a4b83SYohann } 205241a4b83SYohann 206241a4b83SYohann //**** 207241a4b83SYohann // 2D 208241a4b83SYohann template <int NCOMP, int P1d> 209241a4b83SYohann inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 210241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 211241a4b83SYohann { 212241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 213241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 214241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 215241a4b83SYohann r_u[comp] = d_u[ind + ndofs * comp]; 216241a4b83SYohann } 217241a4b83SYohann } 218241a4b83SYohann } 219241a4b83SYohann 220241a4b83SYohann template <int NCOMP, int P1d> 221241a4b83SYohann inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 222241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 223241a4b83SYohann { 224241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 225241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 226241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 227241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 228241a4b83SYohann } 229241a4b83SYohann } 230241a4b83SYohann } 231241a4b83SYohann 232241a4b83SYohann template <int NCOMP, int Q1d> 233241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 234241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 235241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 236241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 237241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 238241a4b83SYohann } 239241a4b83SYohann } 240241a4b83SYohann 241241a4b83SYohann template <int NCOMP, int Q1d> 242241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 243241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 244241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 245241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 246241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 247241a4b83SYohann } 248241a4b83SYohann } 249241a4b83SYohann 250241a4b83SYohann template <int NCOMP, int P1d> 251241a4b83SYohann inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 252241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 253241a4b83SYohann { 254241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 255241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 256241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 257241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]); 258241a4b83SYohann } 259241a4b83SYohann } 260241a4b83SYohann } 261241a4b83SYohann 262241a4b83SYohann template <int NCOMP, int P1d> 263241a4b83SYohann inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 264241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 265241a4b83SYohann { 266241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 267241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 268241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 269241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 270241a4b83SYohann } 271241a4b83SYohann } 272241a4b83SYohann } 273241a4b83SYohann 274241a4b83SYohann template <int NCOMP, int Q1d> 275241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 276241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 277241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 278241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 279241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 280241a4b83SYohann } 281241a4b83SYohann } 282241a4b83SYohann 283241a4b83SYohann template <int NCOMP, int Q1d> 284241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 285241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 286241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 287241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 288241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 289241a4b83SYohann } 290241a4b83SYohann } 291241a4b83SYohann 292241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 293241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 294241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 295241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 296241a4b83SYohann __syncthreads(); 297241a4b83SYohann *V = 0.0; 298241a4b83SYohann for (int i = 0; i < P1d; ++i) { 299241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 300241a4b83SYohann } 301241a4b83SYohann __syncthreads(); 302241a4b83SYohann } 303241a4b83SYohann 304241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 305241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 306241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 307241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 308241a4b83SYohann __syncthreads(); 309241a4b83SYohann *V = 0.0; 310241a4b83SYohann for (int i = 0; i < P1d; ++i) { 311241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 312241a4b83SYohann } 313241a4b83SYohann __syncthreads(); 314241a4b83SYohann } 315241a4b83SYohann 316241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 317241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 318241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 319241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 320241a4b83SYohann __syncthreads(); 321241a4b83SYohann *V = 0.0; 322241a4b83SYohann if (data.tidy<P1d) { 323241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 324241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 325241a4b83SYohann } 326241a4b83SYohann } 327241a4b83SYohann __syncthreads(); 328241a4b83SYohann } 329241a4b83SYohann 330241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 331241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 332241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 333241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 334241a4b83SYohann __syncthreads(); 335241a4b83SYohann *V = 0.0; 336241a4b83SYohann if (data.tidx<P1d) { 337241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 338241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 339241a4b83SYohann } 340241a4b83SYohann } 341241a4b83SYohann __syncthreads(); 342241a4b83SYohann } 343241a4b83SYohann 344241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 345241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 346241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 347241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 348241a4b83SYohann __syncthreads(); 349241a4b83SYohann if (data.tidx<P1d) { 350241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 351241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 352241a4b83SYohann } 353241a4b83SYohann } 354241a4b83SYohann __syncthreads(); 355241a4b83SYohann } 356241a4b83SYohann 357241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 358241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 359241a4b83SYohann CeedScalar *__restrict__ r_V) { 360241a4b83SYohann CeedScalar r_t[1]; 361241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 362241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 363241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 364241a4b83SYohann } 365241a4b83SYohann } 366241a4b83SYohann 367241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 368241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 369241a4b83SYohann CeedScalar *__restrict__ r_V) { 370241a4b83SYohann CeedScalar r_t[1]; 371241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 372241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 373241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 374241a4b83SYohann } 375241a4b83SYohann } 376241a4b83SYohann 377241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 378241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 379241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 380241a4b83SYohann CeedScalar *__restrict__ r_V) { 381241a4b83SYohann CeedScalar r_t[1]; 382241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 383241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 384241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 385241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 386241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 387241a4b83SYohann } 388241a4b83SYohann } 389241a4b83SYohann 390241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 391241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 392241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 393241a4b83SYohann CeedScalar *__restrict__ r_V) { 394241a4b83SYohann CeedScalar r_t[1]; 395241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 396241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 397241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 398241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 399241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 400241a4b83SYohann } 401241a4b83SYohann } 402241a4b83SYohann 403241a4b83SYohann //**** 404241a4b83SYohann // 3D 405241a4b83SYohann template <int NCOMP, int P1d> 406241a4b83SYohann inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 407241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 408241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 409241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 410241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 411241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 412241a4b83SYohann r_u[z+comp*P1d] = d_u[ind + ndofs * comp]; 413241a4b83SYohann } 414241a4b83SYohann } 415241a4b83SYohann } 416241a4b83SYohann } 417241a4b83SYohann 418241a4b83SYohann template <int NCOMP, int P1d> 419241a4b83SYohann inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 420241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 421241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 422241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 423241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 424241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 425241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 426241a4b83SYohann } 427241a4b83SYohann } 428241a4b83SYohann } 429241a4b83SYohann } 430241a4b83SYohann 431241a4b83SYohann template <int NCOMP, int Q1d> 432241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 433241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 434241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 435241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 436241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 437241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind + nquads * comp]; 438241a4b83SYohann } 439241a4b83SYohann } 440241a4b83SYohann } 441241a4b83SYohann 442241a4b83SYohann template <int NCOMP, int Q1d> 443241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 444241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 445241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 446241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 447241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 448241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp]; 449241a4b83SYohann } 450241a4b83SYohann } 451241a4b83SYohann } 452241a4b83SYohann 453241a4b83SYohann template <int NCOMP, int P1d> 454241a4b83SYohann inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 455241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 456241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 457241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 458241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 459241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 460241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[z+comp*P1d]); 461241a4b83SYohann } 462241a4b83SYohann } 463241a4b83SYohann } 464241a4b83SYohann } 465241a4b83SYohann 466241a4b83SYohann template <int NCOMP, int P1d> 467241a4b83SYohann inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 468241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 469241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 470241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 471241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 472241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 473241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 474241a4b83SYohann } 475241a4b83SYohann } 476241a4b83SYohann } 477241a4b83SYohann } 478241a4b83SYohann 479241a4b83SYohann template <int NCOMP, int Q1d> 480241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 481241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 482241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 483241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 484241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 485241a4b83SYohann d_v[ind + nquads * comp] = r_v[z+comp*Q1d]; 486241a4b83SYohann } 487241a4b83SYohann } 488241a4b83SYohann } 489241a4b83SYohann 490241a4b83SYohann template <int NCOMP, int Q1d> 491241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 492241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 493241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 494241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 495241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 496241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d]; 497241a4b83SYohann } 498241a4b83SYohann } 499241a4b83SYohann } 500241a4b83SYohann 501241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 502241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 503241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 504241a4b83SYohann for (int k = 0; k < P1d; ++k) { 505241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 506241a4b83SYohann __syncthreads(); 507241a4b83SYohann V[k] = 0.0; 508241a4b83SYohann for (int i = 0; i < P1d; ++i) { 509241a4b83SYohann V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 510241a4b83SYohann } 511241a4b83SYohann __syncthreads(); 512241a4b83SYohann } 513241a4b83SYohann } 514241a4b83SYohann 515241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 516241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 517241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 518241a4b83SYohann for (int k = 0; k < P1d; ++k) { 519241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 520241a4b83SYohann __syncthreads(); 521241a4b83SYohann V[k] = 0.0; 522241a4b83SYohann for (int i = 0; i < P1d; ++i) { 523241a4b83SYohann V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 524241a4b83SYohann } 525241a4b83SYohann __syncthreads(); 526241a4b83SYohann } 527241a4b83SYohann } 528241a4b83SYohann 529241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 530241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 531241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 532241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 533241a4b83SYohann V[k] = 0.0; 534241a4b83SYohann for (int i = 0; i < P1d; ++i) { 535241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 536241a4b83SYohann } 537241a4b83SYohann } 538241a4b83SYohann } 539241a4b83SYohann 540241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 541241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 542241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 543241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 544241a4b83SYohann V[k] = 0.0; 545241a4b83SYohann if (k<P1d) { 546241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 547241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 548241a4b83SYohann } 549241a4b83SYohann } 550241a4b83SYohann } 551241a4b83SYohann } 552241a4b83SYohann 553241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 554241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 555241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 556241a4b83SYohann for (int k = 0; k < P1d; ++k) { 557241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 558241a4b83SYohann __syncthreads(); 559241a4b83SYohann V[k] = 0.0; 560241a4b83SYohann if (data.tidy<P1d) { 561241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 562241a4b83SYohann V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 563241a4b83SYohann } 564241a4b83SYohann } 565241a4b83SYohann __syncthreads(); 566241a4b83SYohann } 567241a4b83SYohann } 568241a4b83SYohann 569241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 570241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 571241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 572241a4b83SYohann for (int k = 0; k < P1d; ++k) { 573241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 574241a4b83SYohann __syncthreads(); 575241a4b83SYohann V[k] = 0.0; 576241a4b83SYohann if (data.tidx<P1d) { 577241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 578241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 579241a4b83SYohann } 580241a4b83SYohann } 581241a4b83SYohann __syncthreads(); 582241a4b83SYohann } 583241a4b83SYohann } 584241a4b83SYohann 585241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 586241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 587241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 588241a4b83SYohann for (int k = 0; k < P1d; ++k) { 589241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 590241a4b83SYohann __syncthreads(); 591241a4b83SYohann if (data.tidx<P1d) { 592241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 593241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 594241a4b83SYohann } 595241a4b83SYohann } 596241a4b83SYohann __syncthreads(); 597241a4b83SYohann } 598241a4b83SYohann } 599241a4b83SYohann 600241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 601241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 602241a4b83SYohann CeedScalar *__restrict__ r_V) { 603241a4b83SYohann CeedScalar r_t1[Q1d]; 604241a4b83SYohann CeedScalar r_t2[Q1d]; 605241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 606241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 607241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 608241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 609241a4b83SYohann } 610241a4b83SYohann } 611241a4b83SYohann 612241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 613241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 614241a4b83SYohann CeedScalar *__restrict__ r_V) { 615241a4b83SYohann CeedScalar r_t1[Q1d]; 616241a4b83SYohann CeedScalar r_t2[Q1d]; 617241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 618241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 619241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 620241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 621241a4b83SYohann } 622241a4b83SYohann } 623241a4b83SYohann 624241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 625241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 626241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 627241a4b83SYohann CeedScalar *__restrict__ r_V) { 628241a4b83SYohann CeedScalar r_t1[Q1d]; 629241a4b83SYohann CeedScalar r_t2[Q1d]; 630241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 631241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 632241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 633241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 634241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 635241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 636241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 637241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 638241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 639241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 640241a4b83SYohann } 641241a4b83SYohann } 642241a4b83SYohann 643241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 644241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 645241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 646241a4b83SYohann CeedScalar *__restrict__ r_V) { 647241a4b83SYohann CeedScalar r_t1[Q1d]; 648241a4b83SYohann CeedScalar r_t2[Q1d]; 649241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 650241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 651241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 652241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 653241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 654241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 655241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 656241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 657241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 658241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 659241a4b83SYohann } 660241a4b83SYohann } 661241a4b83SYohann 662241a4b83SYohann template <int Q1d> 663241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 664241a4b83SYohann *w = qweight1d[data.tidx]; 665241a4b83SYohann } 666241a4b83SYohann 667241a4b83SYohann template <int Q1d> 668241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 669241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 670241a4b83SYohann } 671241a4b83SYohann 672241a4b83SYohann template <int Q1d> 673241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 674241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 675241a4b83SYohann for (int z = 0; z < Q1d; ++z) 676241a4b83SYohann { 677241a4b83SYohann w[z] = pw*qweight1d[z]; 678241a4b83SYohann } 679241a4b83SYohann } 680241a4b83SYohann 681241a4b83SYohann ); 682241a4b83SYohann 683241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 684241a4b83SYohann 685241a4b83SYohann using std::ostringstream; 686241a4b83SYohann using std::string; 687241a4b83SYohann int ierr; 688241a4b83SYohann bool setupdone; 689241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 690241a4b83SYohann if (setupdone) return 0; 691241a4b83SYohann Ceed ceed; 692241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 693241a4b83SYohann CeedOperator_Cuda_gen *data; 694241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 695241a4b83SYohann CeedQFunction qf; 696241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 697241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 698241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 699241a4b83SYohann CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof; 700241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 701241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 702241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 703241a4b83SYohann CeedChk(ierr); 704241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 705241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 706241a4b83SYohann CeedChk(ierr); 707241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 708241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 709241a4b83SYohann CeedChk(ierr); 710241a4b83SYohann CeedEvalMode emode; 711241a4b83SYohann CeedTransposeMode lmode; 712241a4b83SYohann CeedBasis basis; 713241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 714241a4b83SYohann CeedElemRestriction Erestrict; 715241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 716241a4b83SYohann 717241a4b83SYohann ostringstream code; 718241a4b83SYohann string devFunctions(deviceFunctions); 719241a4b83SYohann 720*f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 721*f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 722*f1a13f77SYohann Dudouit Ceed delegate; 723*f1a13f77SYohann Dudouit CeedGetDelegate(ceed, &delegate); 724*f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 725*f1a13f77SYohann Dudouit ierr = CeedGetData(delegate, (void **)&ceed_data); CeedChk(ierr); 726*f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 727*f1a13f77SYohann Dudouit if(prop.major<6){ 728*f1a13f77SYohann Dudouit std::cout<< "I'm an old GPU" << std::endl; 729*f1a13f77SYohann Dudouit code << atomicAdd; 730*f1a13f77SYohann Dudouit } 731*f1a13f77SYohann Dudouit 732241a4b83SYohann code << devFunctions; 733241a4b83SYohann 734241a4b83SYohann string qFunction(qf_data->qFunctionSource); 735241a4b83SYohann code << qFunction; 736241a4b83SYohann 737241a4b83SYohann // Setup 738241a4b83SYohann code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 739241a4b83SYohann // Input Evecs and Restriction 740241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 741241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 742241a4b83SYohann CeedChk(ierr); 743241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 744241a4b83SYohann } else { 745241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 746241a4b83SYohann if (emode != CEED_EVAL_NONE) 747241a4b83SYohann { 748241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 749241a4b83SYohann bool isTensor; 750241a4b83SYohann ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 751241a4b83SYohann //TODO check that all are the same 752241a4b83SYohann ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 753241a4b83SYohann if (isTensor) 754241a4b83SYohann { 755241a4b83SYohann //TODO check that all are the same 756241a4b83SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 757241a4b83SYohann } else { 758241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 759241a4b83SYohann } 760241a4b83SYohann } 761241a4b83SYohann } 762241a4b83SYohann } 763241a4b83SYohann data->dim = dim; 764241a4b83SYohann data->Q1d = Q1d; 765241a4b83SYohann 766241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 767241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 768241a4b83SYohann } 769241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 770241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 771241a4b83SYohann // code << "const CeedInt Q = "<<Q<<";\n"; 772241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 773241a4b83SYohann code << "BackendData data;\n"; 774241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 775241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 776241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 777241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 778241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 779241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 780241a4b83SYohann code << "// Input field "<<i<<"\n"; 781241a4b83SYohann // Get elemsize, emode, ncomp 782241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 783241a4b83SYohann CeedChk(ierr); 784241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 785241a4b83SYohann CeedChk(ierr); 786241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 787241a4b83SYohann CeedChk(ierr); 788241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 789241a4b83SYohann CeedChk(ierr); 790241a4b83SYohann // Basis action 791241a4b83SYohann switch (emode) { 792241a4b83SYohann case CEED_EVAL_NONE: 793241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 794241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 795241a4b83SYohann code << " const CeedInt nquads_in_"<<i<<" = "<<ndof/ncomp<<";\n"; 796241a4b83SYohann break; 797241a4b83SYohann case CEED_EVAL_INTERP: 798241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 799241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 800241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 801241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 802241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 803241a4b83SYohann code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n"; 804241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 805241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 806241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 807241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 808241a4b83SYohann break; 809241a4b83SYohann case CEED_EVAL_GRAD: 810241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 811241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 812241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 813241a4b83SYohann code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n"; 814241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 815241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 816241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 817241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 818241a4b83SYohann data->G.in[i] = basis_data->d_grad1d; 819241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 820241a4b83SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 821241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 822241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 823241a4b83SYohann break; 824241a4b83SYohann case CEED_EVAL_WEIGHT: 825241a4b83SYohann break; // No action 826241a4b83SYohann case CEED_EVAL_DIV: 827241a4b83SYohann break; // TODO: Not implemented 828241a4b83SYohann case CEED_EVAL_CURL: 829241a4b83SYohann break; // TODO: Not implemented 830241a4b83SYohann } 831241a4b83SYohann } 832241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 833241a4b83SYohann code << "// Output field "<<i<<"\n"; 834241a4b83SYohann // Get elemsize, emode, ncomp 835241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 836241a4b83SYohann CeedChk(ierr); 837241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 838241a4b83SYohann CeedChk(ierr); 839241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 840241a4b83SYohann CeedChk(ierr); 841241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 842241a4b83SYohann CeedChk(ierr); 843241a4b83SYohann // Basis action 844241a4b83SYohann switch (emode) { 845241a4b83SYohann case CEED_EVAL_NONE: 846241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 847241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 848241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 849241a4b83SYohann code << " const CeedInt nquads_out_"<<i<<" = "<<ndof<<"/ncomp_out_"<<i<<";\n"; 850241a4b83SYohann break; // No action 851241a4b83SYohann case CEED_EVAL_INTERP: 852241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 853241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 854241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 855241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 856241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 857241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 858241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 859241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 860241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 861241a4b83SYohann code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n"; 862241a4b83SYohann break; 863241a4b83SYohann case CEED_EVAL_GRAD: 864241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 865241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 866241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 867241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 868241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 869241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 870241a4b83SYohann data->G.out[i] = basis_data->d_grad1d; 871241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 872241a4b83SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 873241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 874241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 875241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 876241a4b83SYohann code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n"; 877241a4b83SYohann break; 878241a4b83SYohann case CEED_EVAL_WEIGHT: { 879241a4b83SYohann Ceed ceed; 880241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 881241a4b83SYohann return CeedError(ceed, 1, 882241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 883241a4b83SYohann break; // Should not occur 884241a4b83SYohann } 885241a4b83SYohann case CEED_EVAL_DIV: 886241a4b83SYohann break; // TODO: Not implemented 887241a4b83SYohann case CEED_EVAL_CURL: 888241a4b83SYohann break; // TODO: Not implemented 889241a4b83SYohann } 890241a4b83SYohann } 891241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 892241a4b83SYohann // Input basis apply if needed 893241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 894241a4b83SYohann code << "// Input field "<<i<<"\n"; 895241a4b83SYohann // Get elemsize, emode, ncomp 896241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 897241a4b83SYohann CeedChk(ierr); 898241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 899241a4b83SYohann CeedChk(ierr); 900241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 901241a4b83SYohann CeedChk(ierr); 902241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 903241a4b83SYohann CeedChk(ierr); 904241a4b83SYohann // Basis action 905241a4b83SYohann switch (emode) { 906241a4b83SYohann case CEED_EVAL_NONE: 907241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 908241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 909241a4b83SYohann code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 910241a4b83SYohann break; 911241a4b83SYohann case CEED_EVAL_INTERP: 912241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 913241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 914241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 915241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 916241a4b83SYohann code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 917241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 918241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 919241a4b83SYohann break; 920241a4b83SYohann case CEED_EVAL_GRAD: 921241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 922241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 923241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 924241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 925241a4b83SYohann code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 926241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 927241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 928241a4b83SYohann break; 929241a4b83SYohann case CEED_EVAL_WEIGHT: 930241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 931241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 932241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 933241a4b83SYohann data->W = basis_data->d_qweight1d; 934241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 935241a4b83SYohann break; // No action 936241a4b83SYohann case CEED_EVAL_DIV: 937241a4b83SYohann break; // TODO: Not implemented 938241a4b83SYohann case CEED_EVAL_CURL: 939241a4b83SYohann break; // TODO: Not implemented 940241a4b83SYohann } 941241a4b83SYohann } 942241a4b83SYohann // Q function 943241a4b83SYohann code << "// QFunction\n"; 944241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 945241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 946241a4b83SYohann CeedChk(ierr); 947241a4b83SYohann if (emode==CEED_EVAL_GRAD) 948241a4b83SYohann { 949241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 950241a4b83SYohann } 951241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 952241a4b83SYohann { 953241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 954241a4b83SYohann } 955241a4b83SYohann } 956241a4b83SYohann //TODO write qfunction load for this backend 957241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 958241a4b83SYohann code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", "; 959241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 960241a4b83SYohann code << "r_t"<<i<<", "; 961241a4b83SYohann } 962241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 963241a4b83SYohann code << "r_tt"<<i; 964241a4b83SYohann if (i<numoutputfields-1) 965241a4b83SYohann { 966241a4b83SYohann code << ", "; 967241a4b83SYohann } 968241a4b83SYohann } 969241a4b83SYohann code << ");\n"; 970241a4b83SYohann 971241a4b83SYohann // Output basis apply if needed 972241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 973241a4b83SYohann code << "// Output field "<<i<<"\n"; 974241a4b83SYohann // Get elemsize, emode, ncomp 975241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 976241a4b83SYohann CeedChk(ierr); 977241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 978241a4b83SYohann CeedChk(ierr); 979241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 980241a4b83SYohann CeedChk(ierr); 981241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 982241a4b83SYohann CeedChk(ierr); 983241a4b83SYohann // Basis action 984241a4b83SYohann switch (emode) { 985241a4b83SYohann case CEED_EVAL_NONE: 986241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 987241a4b83SYohann code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 988241a4b83SYohann break; // No action 989241a4b83SYohann case CEED_EVAL_INTERP: 990241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 991241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 992241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 993241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 994241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 995241a4b83SYohann code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 996241a4b83SYohann break; 997241a4b83SYohann case CEED_EVAL_GRAD: 998241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 999241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1000241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 1001241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 1002241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 1003241a4b83SYohann code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1004241a4b83SYohann break; 1005241a4b83SYohann case CEED_EVAL_WEIGHT: { 1006241a4b83SYohann Ceed ceed; 1007241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1008241a4b83SYohann return CeedError(ceed, 1, 1009241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1010241a4b83SYohann break; // Should not occur 1011241a4b83SYohann } 1012241a4b83SYohann case CEED_EVAL_DIV: 1013241a4b83SYohann break; // TODO: Not implemented 1014241a4b83SYohann case CEED_EVAL_CURL: 1015241a4b83SYohann break; // TODO: Not implemented 1016241a4b83SYohann } 1017241a4b83SYohann } 1018241a4b83SYohann 1019241a4b83SYohann code << " }\n"; 1020241a4b83SYohann code << "}\n\n"; 1021241a4b83SYohann 1022241a4b83SYohann // std::cout << code.str(); 1023241a4b83SYohann 1024241a4b83SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1025241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1026241a4b83SYohann CeedChk(ierr); 1027241a4b83SYohann 1028241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1029241a4b83SYohann 1030241a4b83SYohann return 0; 1031241a4b83SYohann } 1032