1*241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4*241a4b83SYohann // 5*241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6*241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7*241a4b83SYohann // element discretizations for exascale applications. For more information and 8*241a4b83SYohann // source code availability see http://github.com/ceed. 9*241a4b83SYohann // 10*241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12*241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13*241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14*241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15*241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 16*241a4b83SYohann #include <ceed-backend.h> 17*241a4b83SYohann #include "ceed-cuda-gen.h" 18*241a4b83SYohann #include <iostream> 19*241a4b83SYohann #include <sstream> 20*241a4b83SYohann #include "../cuda/ceed-cuda.h" 21*241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 22*241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 23*241a4b83SYohann 24*241a4b83SYohann static const char *deviceFunctions = QUOTE( 25*241a4b83SYohann 26*241a4b83SYohann typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 27*241a4b83SYohann typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 28*241a4b83SYohann 29*241a4b83SYohann typedef struct { 30*241a4b83SYohann CeedInt tidx; 31*241a4b83SYohann CeedInt tidy; 32*241a4b83SYohann CeedInt tidz; 33*241a4b83SYohann CeedInt tid; 34*241a4b83SYohann CeedScalar* slice; 35*241a4b83SYohann } BackendData; 36*241a4b83SYohann 37*241a4b83SYohann #if __CUDA_ARCH__ < 600 38*241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 39*241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 40*241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 41*241a4b83SYohann do { 42*241a4b83SYohann assumed = old; 43*241a4b83SYohann old = 44*241a4b83SYohann atomicCAS(address_as_ull, assumed, 45*241a4b83SYohann __double_as_longlong(val + 46*241a4b83SYohann __longlong_as_double(assumed))); 47*241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 48*241a4b83SYohann // (since NaN != NaN) 49*241a4b83SYohann } while (assumed != old); 50*241a4b83SYohann return __longlong_as_double(old); 51*241a4b83SYohann } 52*241a4b83SYohann #endif // __CUDA_ARCH__ < 600 53*241a4b83SYohann 54*241a4b83SYohann template <int P, int Q> 55*241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 56*241a4b83SYohann for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) { 57*241a4b83SYohann B[i] = d_B[i]; 58*241a4b83SYohann } 59*241a4b83SYohann __syncthreads; 60*241a4b83SYohann } 61*241a4b83SYohann 62*241a4b83SYohann //**** 63*241a4b83SYohann // 1D 64*241a4b83SYohann template <int NCOMP, int P1d> 65*241a4b83SYohann inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 66*241a4b83SYohann if (data.tidx<P1d) 67*241a4b83SYohann { 68*241a4b83SYohann const CeedInt dof = data.tidx; 69*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 70*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 71*241a4b83SYohann r_u[comp] = d_u[ind + ndofs * comp]; 72*241a4b83SYohann } 73*241a4b83SYohann } 74*241a4b83SYohann } 75*241a4b83SYohann 76*241a4b83SYohann template <int NCOMP, int P1d> 77*241a4b83SYohann inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 78*241a4b83SYohann if (data.tidx<P1d) 79*241a4b83SYohann { 80*241a4b83SYohann const CeedInt dof = data.tidx; 81*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 82*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 83*241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 84*241a4b83SYohann } 85*241a4b83SYohann } 86*241a4b83SYohann } 87*241a4b83SYohann 88*241a4b83SYohann template <int NCOMP, int Q1d> 89*241a4b83SYohann inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 90*241a4b83SYohann const CeedInt dof = data.tidx; 91*241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 92*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 93*241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 94*241a4b83SYohann } 95*241a4b83SYohann } 96*241a4b83SYohann 97*241a4b83SYohann template <int NCOMP, int Q1d> 98*241a4b83SYohann inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 99*241a4b83SYohann const CeedInt dof = data.tidx; 100*241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 101*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 102*241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 103*241a4b83SYohann } 104*241a4b83SYohann } 105*241a4b83SYohann 106*241a4b83SYohann template <int NCOMP, int P1d> 107*241a4b83SYohann inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 108*241a4b83SYohann if (data.tidx<P1d) 109*241a4b83SYohann { 110*241a4b83SYohann const CeedInt dof = data.tidx; 111*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 112*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 113*241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]); 114*241a4b83SYohann } 115*241a4b83SYohann } 116*241a4b83SYohann } 117*241a4b83SYohann 118*241a4b83SYohann template <int NCOMP, int P1d> 119*241a4b83SYohann inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 120*241a4b83SYohann if (data.tidx<P1d) 121*241a4b83SYohann { 122*241a4b83SYohann const CeedInt dof = data.tidx; 123*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d] : dof + elem * P1d; 124*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 125*241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 126*241a4b83SYohann } 127*241a4b83SYohann } 128*241a4b83SYohann } 129*241a4b83SYohann 130*241a4b83SYohann template <int NCOMP, int Q1d> 131*241a4b83SYohann inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 132*241a4b83SYohann const CeedInt dof = data.tidx; 133*241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 134*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 135*241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 136*241a4b83SYohann } 137*241a4b83SYohann } 138*241a4b83SYohann 139*241a4b83SYohann template <int NCOMP, int Q1d> 140*241a4b83SYohann inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 141*241a4b83SYohann const CeedInt dof = data.tidx; 142*241a4b83SYohann const CeedInt ind = dof + elem * Q1d; 143*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 144*241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 145*241a4b83SYohann } 146*241a4b83SYohann } 147*241a4b83SYohann 148*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 149*241a4b83SYohann inline __device__ void ContractX1d(BackendData& data, 150*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 151*241a4b83SYohann data.slice[data.tidx] = *U; 152*241a4b83SYohann __syncthreads(); 153*241a4b83SYohann *V = 0.0; 154*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 155*241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i];//contract x direction 156*241a4b83SYohann } 157*241a4b83SYohann __syncthreads(); 158*241a4b83SYohann } 159*241a4b83SYohann 160*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 161*241a4b83SYohann inline __device__ void ContractTransposeX1d(BackendData& data, 162*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 163*241a4b83SYohann data.slice[data.tidx] = *U; 164*241a4b83SYohann __syncthreads(); 165*241a4b83SYohann *V = 0.0; 166*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 167*241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction 168*241a4b83SYohann } 169*241a4b83SYohann __syncthreads(); 170*241a4b83SYohann } 171*241a4b83SYohann 172*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 173*241a4b83SYohann inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 174*241a4b83SYohann CeedScalar *__restrict__ r_V) { 175*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 176*241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 177*241a4b83SYohann } 178*241a4b83SYohann } 179*241a4b83SYohann 180*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 181*241a4b83SYohann inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 182*241a4b83SYohann CeedScalar *__restrict__ r_V) { 183*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 184*241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_V+comp); 185*241a4b83SYohann } 186*241a4b83SYohann } 187*241a4b83SYohann 188*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 189*241a4b83SYohann inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, 190*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 191*241a4b83SYohann CeedScalar *__restrict__ r_V) { 192*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 193*241a4b83SYohann ContractX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 194*241a4b83SYohann } 195*241a4b83SYohann } 196*241a4b83SYohann 197*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 198*241a4b83SYohann inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, 199*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 200*241a4b83SYohann CeedScalar *__restrict__ r_V) { 201*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 202*241a4b83SYohann ContractTransposeX1d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_V+comp); 203*241a4b83SYohann } 204*241a4b83SYohann } 205*241a4b83SYohann 206*241a4b83SYohann //**** 207*241a4b83SYohann // 2D 208*241a4b83SYohann template <int NCOMP, int P1d> 209*241a4b83SYohann inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 210*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 211*241a4b83SYohann { 212*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 213*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 214*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 215*241a4b83SYohann r_u[comp] = d_u[ind + ndofs * comp]; 216*241a4b83SYohann } 217*241a4b83SYohann } 218*241a4b83SYohann } 219*241a4b83SYohann 220*241a4b83SYohann template <int NCOMP, int P1d> 221*241a4b83SYohann inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 222*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 223*241a4b83SYohann { 224*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 225*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 226*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 227*241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 228*241a4b83SYohann } 229*241a4b83SYohann } 230*241a4b83SYohann } 231*241a4b83SYohann 232*241a4b83SYohann template <int NCOMP, int Q1d> 233*241a4b83SYohann inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 234*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 235*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 236*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 237*241a4b83SYohann r_u[comp] = d_u[ind + nquads * comp]; 238*241a4b83SYohann } 239*241a4b83SYohann } 240*241a4b83SYohann 241*241a4b83SYohann template <int NCOMP, int Q1d> 242*241a4b83SYohann inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 243*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 244*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 245*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 246*241a4b83SYohann r_u[comp] = d_u[ind * NCOMP + comp]; 247*241a4b83SYohann } 248*241a4b83SYohann } 249*241a4b83SYohann 250*241a4b83SYohann template <int NCOMP, int P1d> 251*241a4b83SYohann inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 252*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 253*241a4b83SYohann { 254*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 255*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 256*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 257*241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[comp]); 258*241a4b83SYohann } 259*241a4b83SYohann } 260*241a4b83SYohann } 261*241a4b83SYohann 262*241a4b83SYohann template <int NCOMP, int P1d> 263*241a4b83SYohann inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 264*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) 265*241a4b83SYohann { 266*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d; 267*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d] : dof + elem * P1d*P1d; 268*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 269*241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[comp]); 270*241a4b83SYohann } 271*241a4b83SYohann } 272*241a4b83SYohann } 273*241a4b83SYohann 274*241a4b83SYohann template <int NCOMP, int Q1d> 275*241a4b83SYohann inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 276*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 277*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 278*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 279*241a4b83SYohann d_v[ind + nquads * comp] = r_v[comp]; 280*241a4b83SYohann } 281*241a4b83SYohann } 282*241a4b83SYohann 283*241a4b83SYohann template <int NCOMP, int Q1d> 284*241a4b83SYohann inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 285*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d; 286*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d; 287*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 288*241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[comp]; 289*241a4b83SYohann } 290*241a4b83SYohann } 291*241a4b83SYohann 292*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 293*241a4b83SYohann inline __device__ void ContractX2d(BackendData& data, 294*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 295*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 296*241a4b83SYohann __syncthreads(); 297*241a4b83SYohann *V = 0.0; 298*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 299*241a4b83SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 300*241a4b83SYohann } 301*241a4b83SYohann __syncthreads(); 302*241a4b83SYohann } 303*241a4b83SYohann 304*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 305*241a4b83SYohann inline __device__ void ContractY2d(BackendData& data, 306*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 307*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 308*241a4b83SYohann __syncthreads(); 309*241a4b83SYohann *V = 0.0; 310*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 311*241a4b83SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 312*241a4b83SYohann } 313*241a4b83SYohann __syncthreads(); 314*241a4b83SYohann } 315*241a4b83SYohann 316*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 317*241a4b83SYohann inline __device__ void ContractYTranspose2d(BackendData& data, 318*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 319*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 320*241a4b83SYohann __syncthreads(); 321*241a4b83SYohann *V = 0.0; 322*241a4b83SYohann if (data.tidy<P1d) { 323*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 324*241a4b83SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 325*241a4b83SYohann } 326*241a4b83SYohann } 327*241a4b83SYohann __syncthreads(); 328*241a4b83SYohann } 329*241a4b83SYohann 330*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 331*241a4b83SYohann inline __device__ void ContractXTranspose2d(BackendData& data, 332*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 333*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 334*241a4b83SYohann __syncthreads(); 335*241a4b83SYohann *V = 0.0; 336*241a4b83SYohann if (data.tidx<P1d) { 337*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 338*241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 339*241a4b83SYohann } 340*241a4b83SYohann } 341*241a4b83SYohann __syncthreads(); 342*241a4b83SYohann } 343*241a4b83SYohann 344*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 345*241a4b83SYohann inline __device__ void ContractXTransposeAdd2d(BackendData& data, 346*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 347*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 348*241a4b83SYohann __syncthreads(); 349*241a4b83SYohann if (data.tidx<P1d) { 350*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 351*241a4b83SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 352*241a4b83SYohann } 353*241a4b83SYohann } 354*241a4b83SYohann __syncthreads(); 355*241a4b83SYohann } 356*241a4b83SYohann 357*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 358*241a4b83SYohann inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 359*241a4b83SYohann CeedScalar *__restrict__ r_V) { 360*241a4b83SYohann CeedScalar r_t[1]; 361*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 362*241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 363*241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 364*241a4b83SYohann } 365*241a4b83SYohann } 366*241a4b83SYohann 367*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 368*241a4b83SYohann inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 369*241a4b83SYohann CeedScalar *__restrict__ r_V) { 370*241a4b83SYohann CeedScalar r_t[1]; 371*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 372*241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 373*241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 374*241a4b83SYohann } 375*241a4b83SYohann } 376*241a4b83SYohann 377*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 378*241a4b83SYohann inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, 379*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 380*241a4b83SYohann CeedScalar *__restrict__ r_V) { 381*241a4b83SYohann CeedScalar r_t[1]; 382*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 383*241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_G, r_t); 384*241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp+0*NCOMP); 385*241a4b83SYohann ContractX2d<NCOMP,P1d,Q1d>(data, r_U+comp, c_B, r_t); 386*241a4b83SYohann ContractY2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp+1*NCOMP); 387*241a4b83SYohann } 388*241a4b83SYohann } 389*241a4b83SYohann 390*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 391*241a4b83SYohann inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, 392*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 393*241a4b83SYohann CeedScalar *__restrict__ r_V) { 394*241a4b83SYohann CeedScalar r_t[1]; 395*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 396*241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+0*NCOMP, c_B, r_t); 397*241a4b83SYohann ContractXTranspose2d<NCOMP,P1d,Q1d>(data, r_t, c_G, r_V+comp); 398*241a4b83SYohann ContractYTranspose2d<NCOMP,P1d,Q1d>(data, r_U+comp+1*NCOMP, c_G, r_t); 399*241a4b83SYohann ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, r_t, c_B, r_V+comp); 400*241a4b83SYohann } 401*241a4b83SYohann } 402*241a4b83SYohann 403*241a4b83SYohann //**** 404*241a4b83SYohann // 3D 405*241a4b83SYohann template <int NCOMP, int P1d> 406*241a4b83SYohann inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 407*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 408*241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 409*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 410*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 411*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 412*241a4b83SYohann r_u[z+comp*P1d] = d_u[ind + ndofs * comp]; 413*241a4b83SYohann } 414*241a4b83SYohann } 415*241a4b83SYohann } 416*241a4b83SYohann } 417*241a4b83SYohann 418*241a4b83SYohann template <int NCOMP, int P1d> 419*241a4b83SYohann inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 420*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 421*241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 422*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 423*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 424*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 425*241a4b83SYohann r_u[z+comp*P1d] = d_u[ind * NCOMP + comp]; 426*241a4b83SYohann } 427*241a4b83SYohann } 428*241a4b83SYohann } 429*241a4b83SYohann } 430*241a4b83SYohann 431*241a4b83SYohann template <int NCOMP, int Q1d> 432*241a4b83SYohann inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 433*241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 434*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 435*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 436*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 437*241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind + nquads * comp]; 438*241a4b83SYohann } 439*241a4b83SYohann } 440*241a4b83SYohann } 441*241a4b83SYohann 442*241a4b83SYohann template <int NCOMP, int Q1d> 443*241a4b83SYohann inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 444*241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 445*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 446*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 447*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 448*241a4b83SYohann r_u[z+comp*Q1d] = d_u[ind * NCOMP + comp]; 449*241a4b83SYohann } 450*241a4b83SYohann } 451*241a4b83SYohann } 452*241a4b83SYohann 453*241a4b83SYohann template <int NCOMP, int P1d> 454*241a4b83SYohann inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 455*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 456*241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 457*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 458*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 459*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 460*241a4b83SYohann atomicAdd(&d_v[ind + ndofs * comp], r_v[z+comp*P1d]); 461*241a4b83SYohann } 462*241a4b83SYohann } 463*241a4b83SYohann } 464*241a4b83SYohann } 465*241a4b83SYohann 466*241a4b83SYohann template <int NCOMP, int P1d> 467*241a4b83SYohann inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 468*241a4b83SYohann if (data.tidx<P1d && data.tidy<P1d) { 469*241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 470*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*P1d + z*P1d*P1d; 471*241a4b83SYohann const CeedInt ind = indices ? indices[dof + elem * P1d*P1d*P1d] : dof + elem * P1d*P1d*P1d; 472*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 473*241a4b83SYohann atomicAdd(&d_v[ind * NCOMP + comp], r_v[z+comp*P1d]); 474*241a4b83SYohann } 475*241a4b83SYohann } 476*241a4b83SYohann } 477*241a4b83SYohann } 478*241a4b83SYohann 479*241a4b83SYohann template <int NCOMP, int Q1d> 480*241a4b83SYohann inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 481*241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 482*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 483*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 484*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 485*241a4b83SYohann d_v[ind + nquads * comp] = r_v[z+comp*Q1d]; 486*241a4b83SYohann } 487*241a4b83SYohann } 488*241a4b83SYohann } 489*241a4b83SYohann 490*241a4b83SYohann template <int NCOMP, int Q1d> 491*241a4b83SYohann inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 492*241a4b83SYohann for(CeedInt z=0; z < Q1d; ++z) { 493*241a4b83SYohann const CeedInt dof = data.tidx + data.tidy*Q1d + z*Q1d*Q1d; 494*241a4b83SYohann const CeedInt ind = dof + elem * Q1d*Q1d*Q1d; 495*241a4b83SYohann for(CeedInt comp = 0; comp < NCOMP; ++comp) { 496*241a4b83SYohann d_v[ind * NCOMP + comp] = r_v[z+comp*Q1d]; 497*241a4b83SYohann } 498*241a4b83SYohann } 499*241a4b83SYohann } 500*241a4b83SYohann 501*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 502*241a4b83SYohann inline __device__ void ContractX3d(BackendData& data, 503*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 504*241a4b83SYohann for (int k = 0; k < P1d; ++k) { 505*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 506*241a4b83SYohann __syncthreads(); 507*241a4b83SYohann V[k] = 0.0; 508*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 509*241a4b83SYohann V[k] += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 510*241a4b83SYohann } 511*241a4b83SYohann __syncthreads(); 512*241a4b83SYohann } 513*241a4b83SYohann } 514*241a4b83SYohann 515*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 516*241a4b83SYohann inline __device__ void ContractY3d(BackendData& data, 517*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 518*241a4b83SYohann for (int k = 0; k < P1d; ++k) { 519*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 520*241a4b83SYohann __syncthreads(); 521*241a4b83SYohann V[k] = 0.0; 522*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 523*241a4b83SYohann V[k] += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 524*241a4b83SYohann } 525*241a4b83SYohann __syncthreads(); 526*241a4b83SYohann } 527*241a4b83SYohann } 528*241a4b83SYohann 529*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 530*241a4b83SYohann inline __device__ void ContractZ3d(BackendData& data, 531*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 532*241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 533*241a4b83SYohann V[k] = 0.0; 534*241a4b83SYohann for (int i = 0; i < P1d; ++i) { 535*241a4b83SYohann V[k] += B[i + k*P1d] * U[i];//contract z direction 536*241a4b83SYohann } 537*241a4b83SYohann } 538*241a4b83SYohann } 539*241a4b83SYohann 540*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 541*241a4b83SYohann inline __device__ void ContractTransposeZ3d(BackendData& data, 542*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 543*241a4b83SYohann for (int k = 0; k < Q1d; ++k) { 544*241a4b83SYohann V[k] = 0.0; 545*241a4b83SYohann if (k<P1d) { 546*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 547*241a4b83SYohann V[k] += B[k + i*P1d] * U[i];//contract z direction 548*241a4b83SYohann } 549*241a4b83SYohann } 550*241a4b83SYohann } 551*241a4b83SYohann } 552*241a4b83SYohann 553*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 554*241a4b83SYohann inline __device__ void ContractTransposeY3d(BackendData& data, 555*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 556*241a4b83SYohann for (int k = 0; k < P1d; ++k) { 557*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 558*241a4b83SYohann __syncthreads(); 559*241a4b83SYohann V[k] = 0.0; 560*241a4b83SYohann if (data.tidy<P1d) { 561*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 562*241a4b83SYohann V[k] += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d];//contract y direction 563*241a4b83SYohann } 564*241a4b83SYohann } 565*241a4b83SYohann __syncthreads(); 566*241a4b83SYohann } 567*241a4b83SYohann } 568*241a4b83SYohann 569*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 570*241a4b83SYohann inline __device__ void ContractTransposeX3d(BackendData& data, 571*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 572*241a4b83SYohann for (int k = 0; k < P1d; ++k) { 573*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 574*241a4b83SYohann __syncthreads(); 575*241a4b83SYohann V[k] = 0.0; 576*241a4b83SYohann if (data.tidx<P1d) { 577*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 578*241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 579*241a4b83SYohann } 580*241a4b83SYohann } 581*241a4b83SYohann __syncthreads(); 582*241a4b83SYohann } 583*241a4b83SYohann } 584*241a4b83SYohann 585*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 586*241a4b83SYohann inline __device__ void ContractTransposeAddX3d(BackendData& data, 587*241a4b83SYohann const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 588*241a4b83SYohann for (int k = 0; k < P1d; ++k) { 589*241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 590*241a4b83SYohann __syncthreads(); 591*241a4b83SYohann if (data.tidx<P1d) { 592*241a4b83SYohann for (int i = 0; i < Q1d; ++i) { 593*241a4b83SYohann V[k] += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d];//contract x direction 594*241a4b83SYohann } 595*241a4b83SYohann } 596*241a4b83SYohann __syncthreads(); 597*241a4b83SYohann } 598*241a4b83SYohann } 599*241a4b83SYohann 600*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 601*241a4b83SYohann inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 602*241a4b83SYohann CeedScalar *__restrict__ r_V) { 603*241a4b83SYohann CeedScalar r_t1[Q1d]; 604*241a4b83SYohann CeedScalar r_t2[Q1d]; 605*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 606*241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 607*241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 608*241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d); 609*241a4b83SYohann } 610*241a4b83SYohann } 611*241a4b83SYohann 612*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 613*241a4b83SYohann inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 614*241a4b83SYohann CeedScalar *__restrict__ r_V) { 615*241a4b83SYohann CeedScalar r_t1[Q1d]; 616*241a4b83SYohann CeedScalar r_t2[Q1d]; 617*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 618*241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1); 619*241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 620*241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 621*241a4b83SYohann } 622*241a4b83SYohann } 623*241a4b83SYohann 624*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 625*241a4b83SYohann inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, 626*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 627*241a4b83SYohann CeedScalar *__restrict__ r_V) { 628*241a4b83SYohann CeedScalar r_t1[Q1d]; 629*241a4b83SYohann CeedScalar r_t2[Q1d]; 630*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 631*241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1); 632*241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 633*241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d); 634*241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 635*241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 636*241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d); 637*241a4b83SYohann ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1); 638*241a4b83SYohann ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 639*241a4b83SYohann ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d); 640*241a4b83SYohann } 641*241a4b83SYohann } 642*241a4b83SYohann 643*241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 644*241a4b83SYohann inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, 645*241a4b83SYohann const CeedScalar *c_B, const CeedScalar *c_G, 646*241a4b83SYohann CeedScalar *__restrict__ r_V) { 647*241a4b83SYohann CeedScalar r_t1[Q1d]; 648*241a4b83SYohann CeedScalar r_t2[Q1d]; 649*241a4b83SYohann for(int comp=0; comp<NCOMP; comp++) { 650*241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1); 651*241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 652*241a4b83SYohann ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d); 653*241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1); 654*241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2); 655*241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 656*241a4b83SYohann ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1); 657*241a4b83SYohann ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2); 658*241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d); 659*241a4b83SYohann } 660*241a4b83SYohann } 661*241a4b83SYohann 662*241a4b83SYohann template <int Q1d> 663*241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 664*241a4b83SYohann *w = qweight1d[data.tidx]; 665*241a4b83SYohann } 666*241a4b83SYohann 667*241a4b83SYohann template <int Q1d> 668*241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 669*241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 670*241a4b83SYohann } 671*241a4b83SYohann 672*241a4b83SYohann template <int Q1d> 673*241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 674*241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 675*241a4b83SYohann for (int z = 0; z < Q1d; ++z) 676*241a4b83SYohann { 677*241a4b83SYohann w[z] = pw*qweight1d[z]; 678*241a4b83SYohann } 679*241a4b83SYohann } 680*241a4b83SYohann 681*241a4b83SYohann ); 682*241a4b83SYohann 683*241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 684*241a4b83SYohann 685*241a4b83SYohann using std::ostringstream; 686*241a4b83SYohann using std::string; 687*241a4b83SYohann int ierr; 688*241a4b83SYohann bool setupdone; 689*241a4b83SYohann ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr); 690*241a4b83SYohann if (setupdone) return 0; 691*241a4b83SYohann Ceed ceed; 692*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 693*241a4b83SYohann CeedOperator_Cuda_gen *data; 694*241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 695*241a4b83SYohann CeedQFunction qf; 696*241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 697*241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 698*241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 699*241a4b83SYohann CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof; 700*241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 701*241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 702*241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 703*241a4b83SYohann CeedChk(ierr); 704*241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 705*241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 706*241a4b83SYohann CeedChk(ierr); 707*241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 708*241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 709*241a4b83SYohann CeedChk(ierr); 710*241a4b83SYohann CeedEvalMode emode; 711*241a4b83SYohann CeedTransposeMode lmode; 712*241a4b83SYohann CeedBasis basis; 713*241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 714*241a4b83SYohann CeedElemRestriction Erestrict; 715*241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 716*241a4b83SYohann 717*241a4b83SYohann ostringstream code; 718*241a4b83SYohann string devFunctions(deviceFunctions); 719*241a4b83SYohann 720*241a4b83SYohann code << devFunctions; 721*241a4b83SYohann 722*241a4b83SYohann string qFunction(qf_data->qFunctionSource); 723*241a4b83SYohann code << qFunction; 724*241a4b83SYohann 725*241a4b83SYohann // Setup 726*241a4b83SYohann code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 727*241a4b83SYohann // Input Evecs and Restriction 728*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 729*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 730*241a4b83SYohann CeedChk(ierr); 731*241a4b83SYohann if (emode == CEED_EVAL_WEIGHT) { // Skip 732*241a4b83SYohann } else { 733*241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 734*241a4b83SYohann if (emode != CEED_EVAL_NONE) 735*241a4b83SYohann { 736*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 737*241a4b83SYohann bool isTensor; 738*241a4b83SYohann ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr); 739*241a4b83SYohann //TODO check that all are the same 740*241a4b83SYohann ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 741*241a4b83SYohann if (isTensor) 742*241a4b83SYohann { 743*241a4b83SYohann //TODO check that all are the same 744*241a4b83SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 745*241a4b83SYohann } else { 746*241a4b83SYohann return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 747*241a4b83SYohann } 748*241a4b83SYohann } 749*241a4b83SYohann } 750*241a4b83SYohann } 751*241a4b83SYohann data->dim = dim; 752*241a4b83SYohann data->Q1d = Q1d; 753*241a4b83SYohann 754*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 755*241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 756*241a4b83SYohann } 757*241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 758*241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 759*241a4b83SYohann // code << "const CeedInt Q = "<<Q<<";\n"; 760*241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 761*241a4b83SYohann code << "BackendData data;\n"; 762*241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 763*241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 764*241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 765*241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 766*241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 767*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 768*241a4b83SYohann code << "// Input field "<<i<<"\n"; 769*241a4b83SYohann // Get elemsize, emode, ncomp 770*241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 771*241a4b83SYohann CeedChk(ierr); 772*241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 773*241a4b83SYohann CeedChk(ierr); 774*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 775*241a4b83SYohann CeedChk(ierr); 776*241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 777*241a4b83SYohann CeedChk(ierr); 778*241a4b83SYohann // Basis action 779*241a4b83SYohann switch (emode) { 780*241a4b83SYohann case CEED_EVAL_NONE: 781*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 782*241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 783*241a4b83SYohann code << " const CeedInt nquads_in_"<<i<<" = "<<ndof/ncomp<<";\n"; 784*241a4b83SYohann break; 785*241a4b83SYohann case CEED_EVAL_INTERP: 786*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 787*241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 788*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 789*241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 790*241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 791*241a4b83SYohann code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n"; 792*241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 793*241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 794*241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 795*241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 796*241a4b83SYohann break; 797*241a4b83SYohann case CEED_EVAL_GRAD: 798*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 799*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 800*241a4b83SYohann code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 801*241a4b83SYohann code << " const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n"; 802*241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 803*241a4b83SYohann code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 804*241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 805*241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 806*241a4b83SYohann data->G.in[i] = basis_data->d_grad1d; 807*241a4b83SYohann code << " __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 808*241a4b83SYohann code << " __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 809*241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 810*241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 811*241a4b83SYohann break; 812*241a4b83SYohann case CEED_EVAL_WEIGHT: 813*241a4b83SYohann break; // No action 814*241a4b83SYohann case CEED_EVAL_DIV: 815*241a4b83SYohann break; // TODO: Not implemented 816*241a4b83SYohann case CEED_EVAL_CURL: 817*241a4b83SYohann break; // TODO: Not implemented 818*241a4b83SYohann } 819*241a4b83SYohann } 820*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 821*241a4b83SYohann code << "// Output field "<<i<<"\n"; 822*241a4b83SYohann // Get elemsize, emode, ncomp 823*241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 824*241a4b83SYohann CeedChk(ierr); 825*241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 826*241a4b83SYohann CeedChk(ierr); 827*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 828*241a4b83SYohann CeedChk(ierr); 829*241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 830*241a4b83SYohann CeedChk(ierr); 831*241a4b83SYohann // Basis action 832*241a4b83SYohann switch (emode) { 833*241a4b83SYohann case CEED_EVAL_NONE: 834*241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 835*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 836*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 837*241a4b83SYohann code << " const CeedInt nquads_out_"<<i<<" = "<<ndof<<"/ncomp_out_"<<i<<";\n"; 838*241a4b83SYohann break; // No action 839*241a4b83SYohann case CEED_EVAL_INTERP: 840*241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 841*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 842*241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 843*241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 844*241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 845*241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 846*241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 847*241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 848*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 849*241a4b83SYohann code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n"; 850*241a4b83SYohann break; 851*241a4b83SYohann case CEED_EVAL_GRAD: 852*241a4b83SYohann code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 853*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 854*241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 855*241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 856*241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 857*241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 858*241a4b83SYohann data->G.out[i] = basis_data->d_grad1d; 859*241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 860*241a4b83SYohann code << " __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 861*241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 862*241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 863*241a4b83SYohann ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr); 864*241a4b83SYohann code << " const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n"; 865*241a4b83SYohann break; 866*241a4b83SYohann case CEED_EVAL_WEIGHT: { 867*241a4b83SYohann Ceed ceed; 868*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 869*241a4b83SYohann return CeedError(ceed, 1, 870*241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 871*241a4b83SYohann break; // Should not occur 872*241a4b83SYohann } 873*241a4b83SYohann case CEED_EVAL_DIV: 874*241a4b83SYohann break; // TODO: Not implemented 875*241a4b83SYohann case CEED_EVAL_CURL: 876*241a4b83SYohann break; // TODO: Not implemented 877*241a4b83SYohann } 878*241a4b83SYohann } 879*241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 880*241a4b83SYohann // Input basis apply if needed 881*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 882*241a4b83SYohann code << "// Input field "<<i<<"\n"; 883*241a4b83SYohann // Get elemsize, emode, ncomp 884*241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 885*241a4b83SYohann CeedChk(ierr); 886*241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 887*241a4b83SYohann CeedChk(ierr); 888*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 889*241a4b83SYohann CeedChk(ierr); 890*241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp); 891*241a4b83SYohann CeedChk(ierr); 892*241a4b83SYohann // Basis action 893*241a4b83SYohann switch (emode) { 894*241a4b83SYohann case CEED_EVAL_NONE: 895*241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 896*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 897*241a4b83SYohann code << " readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n"; 898*241a4b83SYohann break; 899*241a4b83SYohann case CEED_EVAL_INTERP: 900*241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 901*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 902*241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 903*241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 904*241a4b83SYohann code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 905*241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 906*241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 907*241a4b83SYohann break; 908*241a4b83SYohann case CEED_EVAL_GRAD: 909*241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 910*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr); 911*241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 912*241a4b83SYohann data->indices.in[i] = restr_data->d_ind; 913*241a4b83SYohann code << " readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 914*241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 915*241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 916*241a4b83SYohann break; 917*241a4b83SYohann case CEED_EVAL_WEIGHT: 918*241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 919*241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 920*241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 921*241a4b83SYohann data->W = basis_data->d_qweight1d; 922*241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 923*241a4b83SYohann break; // No action 924*241a4b83SYohann case CEED_EVAL_DIV: 925*241a4b83SYohann break; // TODO: Not implemented 926*241a4b83SYohann case CEED_EVAL_CURL: 927*241a4b83SYohann break; // TODO: Not implemented 928*241a4b83SYohann } 929*241a4b83SYohann } 930*241a4b83SYohann // Q function 931*241a4b83SYohann code << "// QFunction\n"; 932*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 933*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 934*241a4b83SYohann CeedChk(ierr); 935*241a4b83SYohann if (emode==CEED_EVAL_GRAD) 936*241a4b83SYohann { 937*241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 938*241a4b83SYohann } 939*241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 940*241a4b83SYohann { 941*241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 942*241a4b83SYohann } 943*241a4b83SYohann } 944*241a4b83SYohann //TODO write qfunction load for this backend 945*241a4b83SYohann string qFunctionName(qf_data->qFunctionName); 946*241a4b83SYohann code << " "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", "; 947*241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 948*241a4b83SYohann code << "r_t"<<i<<", "; 949*241a4b83SYohann } 950*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 951*241a4b83SYohann code << "r_tt"<<i; 952*241a4b83SYohann if (i<numoutputfields-1) 953*241a4b83SYohann { 954*241a4b83SYohann code << ", "; 955*241a4b83SYohann } 956*241a4b83SYohann } 957*241a4b83SYohann code << ");\n"; 958*241a4b83SYohann 959*241a4b83SYohann // Output basis apply if needed 960*241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 961*241a4b83SYohann code << "// Output field "<<i<<"\n"; 962*241a4b83SYohann // Get elemsize, emode, ncomp 963*241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 964*241a4b83SYohann CeedChk(ierr); 965*241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 966*241a4b83SYohann CeedChk(ierr); 967*241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 968*241a4b83SYohann CeedChk(ierr); 969*241a4b83SYohann ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp); 970*241a4b83SYohann CeedChk(ierr); 971*241a4b83SYohann // Basis action 972*241a4b83SYohann switch (emode) { 973*241a4b83SYohann case CEED_EVAL_NONE: 974*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 975*241a4b83SYohann code << " writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n"; 976*241a4b83SYohann break; // No action 977*241a4b83SYohann case CEED_EVAL_INTERP: 978*241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 979*241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 980*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 981*241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 982*241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 983*241a4b83SYohann code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 984*241a4b83SYohann break; 985*241a4b83SYohann case CEED_EVAL_GRAD: 986*241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 987*241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 988*241a4b83SYohann ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr); 989*241a4b83SYohann ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 990*241a4b83SYohann data->indices.out[i] = restr_data->d_ind; 991*241a4b83SYohann code << " writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 992*241a4b83SYohann break; 993*241a4b83SYohann case CEED_EVAL_WEIGHT: { 994*241a4b83SYohann Ceed ceed; 995*241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 996*241a4b83SYohann return CeedError(ceed, 1, 997*241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 998*241a4b83SYohann break; // Should not occur 999*241a4b83SYohann } 1000*241a4b83SYohann case CEED_EVAL_DIV: 1001*241a4b83SYohann break; // TODO: Not implemented 1002*241a4b83SYohann case CEED_EVAL_CURL: 1003*241a4b83SYohann break; // TODO: Not implemented 1004*241a4b83SYohann } 1005*241a4b83SYohann } 1006*241a4b83SYohann 1007*241a4b83SYohann code << " }\n"; 1008*241a4b83SYohann code << "}\n\n"; 1009*241a4b83SYohann 1010*241a4b83SYohann // std::cout << code.str(); 1011*241a4b83SYohann 1012*241a4b83SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr); 1013*241a4b83SYohann ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); 1014*241a4b83SYohann CeedChk(ierr); 1015*241a4b83SYohann 1016*241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1017*241a4b83SYohann 1018*241a4b83SYohann return 0; 1019*241a4b83SYohann } 1020