1241a4b83SYohann // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2241a4b83SYohann // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3241a4b83SYohann // All Rights reserved. See files LICENSE and NOTICE for details. 4241a4b83SYohann // 5241a4b83SYohann // This file is part of CEED, a collection of benchmarks, miniapps, software 6241a4b83SYohann // libraries and APIs for efficient high-order finite element and spectral 7241a4b83SYohann // element discretizations for exascale applications. For more information and 8241a4b83SYohann // source code availability see http://github.com/ceed. 9241a4b83SYohann // 10241a4b83SYohann // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11241a4b83SYohann // a collaborative effort of two U.S. Department of Energy organizations (Office 12241a4b83SYohann // of Science and the National Nuclear Security Administration) responsible for 13241a4b83SYohann // the planning and preparation of a capable exascale ecosystem, including 14241a4b83SYohann // software, applications, hardware, advanced system engineering and early 15241a4b83SYohann // testbed platforms, in support of the nation's exascale computing imperative. 167df94212SJeremy L Thompson 17241a4b83SYohann #include "ceed-cuda-gen.h" 18241a4b83SYohann #include <iostream> 19241a4b83SYohann #include <sstream> 20241a4b83SYohann #include "../cuda-reg/ceed-cuda-reg.h" 21241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 22241a4b83SYohann 23f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 24ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 25ab213215SJeremy L Thompson // Atomic add, for older CUDA 26ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 27241a4b83SYohann __device__ double atomicAdd(double *address, double val) { 28241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 29241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 30241a4b83SYohann do { 31241a4b83SYohann assumed = old; 32241a4b83SYohann old = 33241a4b83SYohann atomicCAS(address_as_ull, assumed, 34241a4b83SYohann __double_as_longlong(val + 35241a4b83SYohann __longlong_as_double(assumed))); 36241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 37241a4b83SYohann // (since NaN != NaN) 38241a4b83SYohann } while (assumed != old); 39241a4b83SYohann return __longlong_as_double(old); 40241a4b83SYohann } 41f1a13f77SYohann Dudouit ); 42f1a13f77SYohann Dudouit 43f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 44f1a13f77SYohann Dudouit 45ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 46ab213215SJeremy L Thompson // Typedefs 47ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 48f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 49f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 50f1a13f77SYohann Dudouit 51f1a13f77SYohann Dudouit typedef struct { 52f1a13f77SYohann Dudouit CeedInt tidx; 53f1a13f77SYohann Dudouit CeedInt tidy; 54f1a13f77SYohann Dudouit CeedInt tidz; 55f1a13f77SYohann Dudouit CeedInt tid; 56f1a13f77SYohann Dudouit CeedScalar* slice; 57f1a13f77SYohann Dudouit } BackendData; 58241a4b83SYohann 59ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 60ab213215SJeremy L Thompson // Load matrices for basis actions 61ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 62241a4b83SYohann template <int P, int Q> 63241a4b83SYohann inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) { 64ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 65241a4b83SYohann B[i] = d_B[i]; 66241a4b83SYohann } 67241a4b83SYohann 68ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 69241a4b83SYohann // 1D 70ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 71ab213215SJeremy L Thompson 72ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 73ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 74ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 755c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 765c7b696cSjeremylt inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 77ab213215SJeremy L Thompson if (data.tidx < P1d) { 784d537eeaSYohann const CeedInt node = data.tidx; 79ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 80ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 815c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 82241a4b83SYohann } 83241a4b83SYohann } 84920dcdc4Sjeremylt 85ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 86ab213215SJeremy L Thompson // L-vector -> E-vector, strided 87ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 88d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 89d80fc06aSjeremylt inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 90ab213215SJeremy L Thompson if (data.tidx < P1d) { 91ccedf6b0Sjeremylt const CeedInt node = data.tidx; 92d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 93ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 94d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 95ccedf6b0Sjeremylt } 96ccedf6b0Sjeremylt } 97241a4b83SYohann 98ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 99ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 100ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1015c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1025c7b696cSjeremylt inline __device__ void writeDofsOffset1d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 103ab213215SJeremy L Thompson if (data.tidx < P1d) { 1044d537eeaSYohann const CeedInt node = data.tidx; 105ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 106ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1075c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 108241a4b83SYohann } 109241a4b83SYohann } 110241a4b83SYohann 111ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 112ab213215SJeremy L Thompson // E-vector -> L-vector, strided 113ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 114d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 115d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 116ab213215SJeremy L Thompson if (data.tidx < P1d) { 117ccedf6b0Sjeremylt const CeedInt node = data.tidx; 118d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 119ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 120d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 121ccedf6b0Sjeremylt } 122ccedf6b0Sjeremylt } 123ccedf6b0Sjeremylt 124ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 125ab213215SJeremy L Thompson // 1D tensor contraction x 126ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 127241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 128ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 129241a4b83SYohann data.slice[data.tidx] = *U; 130241a4b83SYohann __syncthreads(); 131241a4b83SYohann *V = 0.0; 132ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 133ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 134241a4b83SYohann __syncthreads(); 135241a4b83SYohann } 136241a4b83SYohann 137ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 138ab213215SJeremy L Thompson // 1D transpose tensor contraction x 139ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 140241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 141ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 142241a4b83SYohann data.slice[data.tidx] = *U; 143241a4b83SYohann __syncthreads(); 144241a4b83SYohann *V = 0.0; 145ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 146ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 147241a4b83SYohann __syncthreads(); 148241a4b83SYohann } 149241a4b83SYohann 150ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 151ab213215SJeremy L Thompson // 1D interpolate to quadrature points 152ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 153241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 154ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 155ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 156241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 157241a4b83SYohann } 158241a4b83SYohann 159ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 160ab213215SJeremy L Thompson // 1D interpolate transpose 161ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 162241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 163ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 164ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 165241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 166241a4b83SYohann } 167241a4b83SYohann 168ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 169ab213215SJeremy L Thompson // 1D derivatives at quadrature points 170ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 171241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 172ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 173ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 174241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 175241a4b83SYohann } 176241a4b83SYohann 177ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 178ab213215SJeremy L Thompson // 1D derivatives transpose 179ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 180241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 181ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 182ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 183241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 184241a4b83SYohann } 185241a4b83SYohann 186ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 187241a4b83SYohann // 2D 188ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 189ab213215SJeremy L Thompson 190ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 191ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 192ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1935c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 1945c7b696cSjeremylt inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 195ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 1964d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 197ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 198ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1995c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 200241a4b83SYohann } 201241a4b83SYohann } 202920dcdc4Sjeremylt 203ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 204ab213215SJeremy L Thompson // L-vector -> E-vector, strided 205ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 206d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 207d80fc06aSjeremylt inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 208ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 209ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 210d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 211ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 212d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 213ccedf6b0Sjeremylt } 214ccedf6b0Sjeremylt } 215241a4b83SYohann 216ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 217ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 218ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2195c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 2205c7b696cSjeremylt inline __device__ void writeDofsOffset2d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 221ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2224d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 223ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 224ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2255c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 226241a4b83SYohann } 227241a4b83SYohann } 228241a4b83SYohann 229ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 230ab213215SJeremy L Thompson // E-vector -> L-vector, strided 231ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 232d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 233d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 234ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 235ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 236d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 237ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 238d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 239ccedf6b0Sjeremylt } 240ccedf6b0Sjeremylt } 241ccedf6b0Sjeremylt 242ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 243ab213215SJeremy L Thompson // 2D tensor contraction x 244ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 245241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 246ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 247241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 248241a4b83SYohann __syncthreads(); 249241a4b83SYohann *V = 0.0; 250ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 251ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 252241a4b83SYohann __syncthreads(); 253241a4b83SYohann } 254241a4b83SYohann 255ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 256ab213215SJeremy L Thompson // 2D tensor contract y 257ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 258241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 259ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 260241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 261241a4b83SYohann __syncthreads(); 262241a4b83SYohann *V = 0.0; 263ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 264ab213215SJeremy L Thompson *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction 265241a4b83SYohann __syncthreads(); 266241a4b83SYohann } 267241a4b83SYohann 268ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 269ab213215SJeremy L Thompson // 2D transpose tensor contract y 270ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 271241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 272ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 273241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 274241a4b83SYohann __syncthreads(); 275241a4b83SYohann *V = 0.0; 276ab213215SJeremy L Thompson if (data.tidy < P1d) 277ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 278ab213215SJeremy L Thompson *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction 279241a4b83SYohann __syncthreads(); 280241a4b83SYohann } 281241a4b83SYohann 282ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 283ab213215SJeremy L Thompson // 2D transpose tensor contract x 284ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 285241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 286ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 287241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 288241a4b83SYohann __syncthreads(); 289241a4b83SYohann *V = 0.0; 290ab213215SJeremy L Thompson if (data.tidx < P1d) 291ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 292ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 293241a4b83SYohann __syncthreads(); 294241a4b83SYohann } 295241a4b83SYohann 296ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 297ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 298ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 299241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 300ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 301241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = *U; 302241a4b83SYohann __syncthreads(); 303ab213215SJeremy L Thompson if (data.tidx < P1d) 304ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 305ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction 306241a4b83SYohann __syncthreads(); 307241a4b83SYohann } 308241a4b83SYohann 309ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 310ab213215SJeremy L Thompson // 2D interpolate to quadrature points 311ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 312241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 313ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 314241a4b83SYohann CeedScalar r_t[1]; 315ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 316241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 317241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 318241a4b83SYohann } 319241a4b83SYohann } 320241a4b83SYohann 321ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 322ab213215SJeremy L Thompson // 2D interpolate transpose 323ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 324241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 325ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 326241a4b83SYohann CeedScalar r_t[1]; 327ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 328241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 329241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 330241a4b83SYohann } 331241a4b83SYohann } 332241a4b83SYohann 333ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 334ab213215SJeremy L Thompson // 2D derivatives at quadrature points 335ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 336241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 337ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 338241a4b83SYohann CeedScalar r_t[1]; 339ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 340241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 341241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 342241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 343241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 344241a4b83SYohann } 345241a4b83SYohann } 346241a4b83SYohann 347ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 348ab213215SJeremy L Thompson // 2D derivatives transpose 349ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 350241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 351ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 352241a4b83SYohann CeedScalar r_t[1]; 353ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 354241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 355241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 356241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 357241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 358241a4b83SYohann } 359241a4b83SYohann } 360241a4b83SYohann 361ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 362241a4b83SYohann // 3D 363ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 364ab213215SJeremy L Thompson 365ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 366ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 367ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3685c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 3695c7b696cSjeremylt inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 370ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 371241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3724d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 373ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 374ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3755c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 376241a4b83SYohann } 377241a4b83SYohann } 378920dcdc4Sjeremylt 379ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 380ab213215SJeremy L Thompson // L-vector -> E-vector, strided 381ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 382d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 383d80fc06aSjeremylt inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) { 384ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 385ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 386ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 387d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 388ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 389d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 390ccedf6b0Sjeremylt } 391ccedf6b0Sjeremylt } 392241a4b83SYohann 393ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 394ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 395ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3965c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 3975c7b696cSjeremylt inline __device__ void readSliceQuadsOffset3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) { 398ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 399920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 400ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4015c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 402ac421f39SYohann } 403ac421f39SYohann 404ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 405ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 406ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 407d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 40825dfc19dSjeremylt inline __device__ void readSliceQuadsStrided3d(BackendData& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) { 409920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 410d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 411ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 412d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 413920dcdc4Sjeremylt } 414920dcdc4Sjeremylt 415ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 416ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 417ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4185c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 4195c7b696cSjeremylt inline __device__ void writeDofsOffset3d(BackendData& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) { 420ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 421241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4224d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 423ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 424ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4255c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 426241a4b83SYohann } 427241a4b83SYohann } 428241a4b83SYohann 429ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 430ab213215SJeremy L Thompson // E-vector -> L-vector, strided 431ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 432d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4332f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) { 434ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 435ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 436ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 437d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 438ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 439d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 440ccedf6b0Sjeremylt } 441ccedf6b0Sjeremylt } 442ccedf6b0Sjeremylt 443ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 444ab213215SJeremy L Thompson // 3D tensor contract x 445ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 446241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 447ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 448ac421f39SYohann CeedScalar r_B[P1d]; 449ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 450ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 451ab213215SJeremy L Thompson 452ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 453241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 454241a4b83SYohann __syncthreads(); 455241a4b83SYohann V[k] = 0.0; 456ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 457ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 458241a4b83SYohann __syncthreads(); 459241a4b83SYohann } 460241a4b83SYohann } 461241a4b83SYohann 462ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 463ab213215SJeremy L Thompson // 3D tensor contract y 464ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 465241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 466ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 467ac421f39SYohann CeedScalar r_B[P1d]; 468ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 469ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 470ab213215SJeremy L Thompson 471ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 472241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 473241a4b83SYohann __syncthreads(); 474241a4b83SYohann V[k] = 0.0; 475ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 476ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 477241a4b83SYohann __syncthreads(); 478241a4b83SYohann } 479241a4b83SYohann } 480241a4b83SYohann 481ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 482ab213215SJeremy L Thompson // 3D tensor contract z 483ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 484241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 485ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 486ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 487241a4b83SYohann V[k] = 0.0; 488ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 489ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 490241a4b83SYohann } 491241a4b83SYohann } 492241a4b83SYohann 493ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 494ab213215SJeremy L Thompson // 3D transpose tensor contract z 495ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 496241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 497ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 498ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 499241a4b83SYohann V[k] = 0.0; 500ab213215SJeremy L Thompson if (k < P1d) 501ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 502ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 503241a4b83SYohann } 504241a4b83SYohann } 505241a4b83SYohann 506ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 507ab213215SJeremy L Thompson // 3D transpose tensor contract y 508ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 509241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 510ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 511ac421f39SYohann CeedScalar r_B[Q1d]; 512ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 513ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 514ab213215SJeremy L Thompson 515ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 516241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 517241a4b83SYohann __syncthreads(); 518241a4b83SYohann V[k] = 0.0; 519ab213215SJeremy L Thompson if (data.tidy < P1d) 520ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 521ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 522ac421f39SYohann __syncthreads(); 523ac421f39SYohann } 524ac421f39SYohann } 525ac421f39SYohann 526ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 527ab213215SJeremy L Thompson // 3D transpose tensor contract add y 528ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 529ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 530ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 531ac421f39SYohann CeedScalar r_B[Q1d]; 532ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 533ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 534ab213215SJeremy L Thompson 535ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 536ac421f39SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 537ac421f39SYohann __syncthreads(); 538ab213215SJeremy L Thompson if (data.tidy < P1d) 539ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 540ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction 541241a4b83SYohann __syncthreads(); 542241a4b83SYohann } 543241a4b83SYohann } 544241a4b83SYohann 545ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 546ab213215SJeremy L Thompson // 3D transpose tensor contract x 547ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 548241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 549ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 550ac421f39SYohann CeedScalar r_B[Q1d]; 551ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 552ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 553ab213215SJeremy L Thompson 554ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 555241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 556241a4b83SYohann __syncthreads(); 557241a4b83SYohann V[k] = 0.0; 558ab213215SJeremy L Thompson if (data.tidx < P1d) 559ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 560ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 561241a4b83SYohann __syncthreads(); 562241a4b83SYohann } 563241a4b83SYohann } 564241a4b83SYohann 565ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 566ab213215SJeremy L Thompson // 3D transpose tensor contract add x 567ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 568241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 569ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData& data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 570ac421f39SYohann CeedScalar r_B[Q1d]; 571ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 572ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 573ab213215SJeremy L Thompson 574ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 575241a4b83SYohann data.slice[data.tidx+data.tidy*Q1d] = U[k]; 576241a4b83SYohann __syncthreads(); 577ab213215SJeremy L Thompson if (data.tidx < P1d) 578ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 579ab213215SJeremy L Thompson V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction 580241a4b83SYohann __syncthreads(); 581241a4b83SYohann } 582241a4b83SYohann } 583241a4b83SYohann 584ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 585ab213215SJeremy L Thompson // 3D interpolate to quadrature points 586ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 587241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 588ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 589241a4b83SYohann CeedScalar r_t1[Q1d]; 590241a4b83SYohann CeedScalar r_t2[Q1d]; 591ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 592241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 593241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 594241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 595241a4b83SYohann } 596241a4b83SYohann } 597241a4b83SYohann 598ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 599ab213215SJeremy L Thompson // 3D interpolate transpose 600ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 601241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 602ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 603241a4b83SYohann CeedScalar r_t1[Q1d]; 604241a4b83SYohann CeedScalar r_t2[Q1d]; 605ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 606241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 607241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 608241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 609241a4b83SYohann } 610241a4b83SYohann } 611241a4b83SYohann 612ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 613ab213215SJeremy L Thompson // 3D derivatives at quadrature points 614ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 615241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 616ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 617241a4b83SYohann CeedScalar r_t1[Q1d]; 618241a4b83SYohann CeedScalar r_t2[Q1d]; 619ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 620241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 621241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 622241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 623241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 624241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 625241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 626241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 627241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 628241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 629241a4b83SYohann } 630241a4b83SYohann } 631241a4b83SYohann 632ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 633ab213215SJeremy L Thompson // 3D derivatives transpose 634ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 635241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 636ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 637241a4b83SYohann CeedScalar r_t1[Q1d]; 638241a4b83SYohann CeedScalar r_t2[Q1d]; 639ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 640241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 641241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 642241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 643241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 644241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 645241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 646241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 647241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 648241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 649241a4b83SYohann } 650241a4b83SYohann } 651241a4b83SYohann 652ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 653ab213215SJeremy L Thompson // 3D collocated derivatives computation 654ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 655ac421f39SYohann template <int NCOMP, int Q1d> 656ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 657ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 658ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d]; 659ac421f39SYohann __syncthreads(); 660ac421f39SYohann // X derivative 661ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 662ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 663ab213215SJeremy L Thompson r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative) 664ac421f39SYohann // Y derivative 665ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 666ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 667ab213215SJeremy L Thompson r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative) 668ac421f39SYohann // Z derivative 669ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 670ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 671ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 672ac421f39SYohann __syncthreads(); 673ac421f39SYohann } 674ac421f39SYohann } 675ac421f39SYohann 676ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 677ab213215SJeremy L Thompson // 3D collocated derivatives transpose 678ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 679ac421f39SYohann template <int NCOMP, int Q1d> 680ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 681ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 682ac421f39SYohann // X derivative 683ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP]; 684ac421f39SYohann __syncthreads(); 685ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 686ab213215SJeremy L Thompson r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative) 687ac421f39SYohann __syncthreads(); 688ac421f39SYohann // Y derivative 689ac421f39SYohann data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP]; 690ac421f39SYohann __syncthreads(); 691ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 692ab213215SJeremy L Thompson r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative) 693ac421f39SYohann __syncthreads(); 694ac421f39SYohann // Z derivative 695ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 696ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 697ac421f39SYohann } 698ac421f39SYohann } 699ac421f39SYohann 700ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 701ab213215SJeremy L Thompson // 1D quadrature weights 702ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 703241a4b83SYohann template <int Q1d> 704241a4b83SYohann inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 705241a4b83SYohann *w = qweight1d[data.tidx]; 706241a4b83SYohann } 707241a4b83SYohann 708ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 709ab213215SJeremy L Thompson // 2D quadrature weights 710ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 711241a4b83SYohann template <int Q1d> 712241a4b83SYohann inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 713241a4b83SYohann *w = qweight1d[data.tidx]*qweight1d[data.tidy]; 714241a4b83SYohann } 715241a4b83SYohann 716ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 717ab213215SJeremy L Thompson // 3D quadrature weights 718ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 719241a4b83SYohann template <int Q1d> 720241a4b83SYohann inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) { 721241a4b83SYohann const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy]; 722ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 723241a4b83SYohann w[z] = pw*qweight1d[z]; 724241a4b83SYohann } 725241a4b83SYohann 726241a4b83SYohann ); 727ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 728ab213215SJeremy L Thompson // Build singe operator kernel 729ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 730241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 731241a4b83SYohann 732241a4b83SYohann using std::ostringstream; 733241a4b83SYohann using std::string; 734241a4b83SYohann int ierr; 735241a4b83SYohann bool setupdone; 736fd364f38SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr); 737241a4b83SYohann if (setupdone) return 0; 738241a4b83SYohann Ceed ceed; 739241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 740241a4b83SYohann CeedOperator_Cuda_gen *data; 741241a4b83SYohann ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr); 742241a4b83SYohann CeedQFunction qf; 743241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 744241a4b83SYohann ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr); 745241a4b83SYohann ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr); 7461da99368SJeremy L Thompson CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields, 7475c7b696cSjeremylt numoutputfields, ncomp, dim = 0, lsize; 748241a4b83SYohann ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr); 749241a4b83SYohann ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr); 750241a4b83SYohann ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields); 751241a4b83SYohann CeedChk(ierr); 752241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 753241a4b83SYohann ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields); 754241a4b83SYohann CeedChk(ierr); 755241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 756241a4b83SYohann ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); 757241a4b83SYohann CeedChk(ierr); 758241a4b83SYohann CeedEvalMode emode; 759241a4b83SYohann CeedBasis basis; 760241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 761241a4b83SYohann CeedElemRestriction Erestrict; 762241a4b83SYohann CeedElemRestriction_Cuda_reg *restr_data; 763241a4b83SYohann 764241a4b83SYohann ostringstream code; 765241a4b83SYohann string devFunctions(deviceFunctions); 766241a4b83SYohann 767f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 768f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 769f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 770abfaacbbSSander Arens ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); 771f1a13f77SYohann Dudouit ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); 772f1a13f77SYohann Dudouit if (prop.major<6){ 773f1a13f77SYohann Dudouit code << atomicAdd; 774f1a13f77SYohann Dudouit } 775f1a13f77SYohann Dudouit 776241a4b83SYohann code << devFunctions; 777241a4b83SYohann 778241a4b83SYohann string qFunction(qf_data->qFunctionSource); 779*c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 780*c3c443faSYohann Dudouit string oper; 781*c3c443faSYohann Dudouit oper = "kernel_" + qFunctionName; 7824d537eeaSYohann 7834d537eeaSYohann code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; 784ee07ded2SValeria Barra code << "\n#define CeedPragmaSIMD\n"; 7851da99368SJeremy L Thompson 7861da99368SJeremy L Thompson // Find dim and Q1d 7871da99368SJeremy L Thompson bool collograd = false; 7881da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 7891da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 7901da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 7911da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 7921da99368SJeremy L Thompson 7931da99368SJeremy L Thompson // Check for collocated gradient 7941da99368SJeremy L Thompson if (basis_data->d_collograd1d) 7951da99368SJeremy L Thompson collograd = true; 7961da99368SJeremy L Thompson 7971da99368SJeremy L Thompson // Collect dim and Q1d 7981da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 7991da99368SJeremy L Thompson bool isTensor; 800fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8011da99368SJeremy L Thompson if (isTensor) { 8021da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8031da99368SJeremy L Thompson } else { 8041da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 8051da99368SJeremy L Thompson } 8061da99368SJeremy L Thompson } 8071da99368SJeremy L Thompson } 8081da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 8091da99368SJeremy L Thompson // The only imput basis might be CEED_BASIS_COLLOCATED 8101da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 8111da99368SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 8121da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 8131da99368SJeremy L Thompson ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 8141da99368SJeremy L Thompson // Collect dim and Q1d 8151da99368SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); 8161da99368SJeremy L Thompson bool isTensor; 817fd364f38SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); 8181da99368SJeremy L Thompson if (isTensor) { 8191da99368SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); 8201da99368SJeremy L Thompson } else { 8211da99368SJeremy L Thompson return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); 8221da99368SJeremy L Thompson } 8231da99368SJeremy L Thompson } 8241da99368SJeremy L Thompson } 8251da99368SJeremy L Thompson data->dim = dim; 8261da99368SJeremy L Thompson data->Q1d = Q1d; 8271da99368SJeremy L Thompson 8281da99368SJeremy L Thompson // Define CEED_Q_VLA 8291da99368SJeremy L Thompson if (dim != 3 || collograd) { 8301da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8311da99368SJeremy L Thompson } else { 8321da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8331da99368SJeremy L Thompson } 8341da99368SJeremy L Thompson 835241a4b83SYohann code << qFunction; 836241a4b83SYohann 837241a4b83SYohann // Setup 838*c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 839241a4b83SYohann // Input Evecs and Restriction 840241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 841241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 842241a4b83SYohann CeedChk(ierr); 8431da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 844241a4b83SYohann code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 845241a4b83SYohann } 846241a4b83SYohann } 847241a4b83SYohann 848241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 849241a4b83SYohann code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 850241a4b83SYohann } 8511da99368SJeremy L Thompson 852241a4b83SYohann code << "const CeedInt Dim = "<<dim<<";\n"; 853241a4b83SYohann code << "const CeedInt Q1d = "<<Q1d<<";\n"; 8541da99368SJeremy L Thompson 855241a4b83SYohann code << "extern __shared__ CeedScalar slice[];\n"; 856241a4b83SYohann code << "BackendData data;\n"; 857241a4b83SYohann code << "data.tidx = threadIdx.x;\n"; 858241a4b83SYohann code << "data.tidy = threadIdx.y;\n"; 859241a4b83SYohann code << "data.tidz = threadIdx.z;\n"; 860241a4b83SYohann code << "data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 861241a4b83SYohann code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n"; 862920dcdc4Sjeremylt 863920dcdc4Sjeremylt code << "\n// Input field constants and basis data\n"; 864ac421f39SYohann //Initialize constants, and matrices B and G 865241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 866920dcdc4Sjeremylt code << "// -- Input field "<<i<<" --\n"; 867241a4b83SYohann // Get elemsize, emode, ncomp 868241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 869241a4b83SYohann CeedChk(ierr); 870241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 871241a4b83SYohann CeedChk(ierr); 872241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 873241a4b83SYohann CeedChk(ierr); 8744d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 875241a4b83SYohann CeedChk(ierr); 876920dcdc4Sjeremylt 877920dcdc4Sjeremylt // Set field constants 878920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 879920dcdc4Sjeremylt ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 880920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 881920dcdc4Sjeremylt ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 882920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 883920dcdc4Sjeremylt } else { 884920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 885920dcdc4Sjeremylt } 886920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 887920dcdc4Sjeremylt } 888920dcdc4Sjeremylt 889920dcdc4Sjeremylt // Load basis data 890920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 891241a4b83SYohann switch (emode) { 892241a4b83SYohann case CEED_EVAL_NONE: 893241a4b83SYohann break; 894241a4b83SYohann case CEED_EVAL_INTERP: 895241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 896241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 897241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 898241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 899241a4b83SYohann break; 900241a4b83SYohann case CEED_EVAL_GRAD: 901241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 902241a4b83SYohann data->B.in[i] = basis_data->d_interp1d; 903241a4b83SYohann code << "__shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 904241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 905ac421f39SYohann if (basis_data->d_collograd1d) { 906ac421f39SYohann data->G.in[i] = basis_data->d_collograd1d; 907ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 908ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 909ac421f39SYohann } else { 910ac421f39SYohann data->G.in[i] = basis_data->d_grad1d; 911ac421f39SYohann code << "__shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 912241a4b83SYohann code << "loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 913ac421f39SYohann } 914241a4b83SYohann break; 915241a4b83SYohann case CEED_EVAL_WEIGHT: 916241a4b83SYohann break; // No action 917241a4b83SYohann case CEED_EVAL_DIV: 918241a4b83SYohann break; // TODO: Not implemented 919241a4b83SYohann case CEED_EVAL_CURL: 920241a4b83SYohann break; // TODO: Not implemented 921241a4b83SYohann } 922241a4b83SYohann } 923920dcdc4Sjeremylt 924920dcdc4Sjeremylt code << "\n// Output field constants and basis data\n"; 925241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 926920dcdc4Sjeremylt code << "// -- Output field "<<i<<" --\n"; 927241a4b83SYohann // Get elemsize, emode, ncomp 928241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 929241a4b83SYohann CeedChk(ierr); 930241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 931241a4b83SYohann CeedChk(ierr); 932241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 933241a4b83SYohann CeedChk(ierr); 9344d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 935241a4b83SYohann CeedChk(ierr); 936920dcdc4Sjeremylt 937920dcdc4Sjeremylt // Set field constants 938241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); 939920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 940241a4b83SYohann ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr); 941241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 942920dcdc4Sjeremylt } else { 943920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 944920dcdc4Sjeremylt } 945920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 946920dcdc4Sjeremylt 947920dcdc4Sjeremylt // Load basis data 948920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 949920dcdc4Sjeremylt switch (emode) { 950920dcdc4Sjeremylt case CEED_EVAL_NONE: 951920dcdc4Sjeremylt break; // No action 952920dcdc4Sjeremylt case CEED_EVAL_INTERP: 953241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 954241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 955241a4b83SYohann code << " __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 956241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 957241a4b83SYohann break; 958241a4b83SYohann case CEED_EVAL_GRAD: 959241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 960241a4b83SYohann data->B.out[i] = basis_data->d_interp1d; 961241a4b83SYohann code << "__shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 962241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 963ac421f39SYohann if (basis_data->d_collograd1d) { 964ac421f39SYohann data->G.out[i] = basis_data->d_collograd1d; 965ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 966ac421f39SYohann code << "loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 967ac421f39SYohann } else { 968ac421f39SYohann data->G.out[i] = basis_data->d_grad1d; 969ac421f39SYohann code << "__shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 970241a4b83SYohann code << "loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 971ac421f39SYohann } 972241a4b83SYohann break; 973241a4b83SYohann case CEED_EVAL_WEIGHT: { 974241a4b83SYohann Ceed ceed; 975241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 976241a4b83SYohann return CeedError(ceed, 1, 977241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 978241a4b83SYohann break; // Should not occur 979241a4b83SYohann } 980241a4b83SYohann case CEED_EVAL_DIV: 981241a4b83SYohann break; // TODO: Not implemented 982241a4b83SYohann case CEED_EVAL_CURL: 983241a4b83SYohann break; // TODO: Not implemented 984241a4b83SYohann } 985241a4b83SYohann } 986ac421f39SYohann code << "\n"; 987ac421f39SYohann code << "__syncthreads();\n"; 988241a4b83SYohann code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 989241a4b83SYohann // Input basis apply if needed 990ac421f39SYohann // Generate the correct eval mode code for each input 991920dcdc4Sjeremylt code << "\n// Input field restrictions and basis actions\n"; 992241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 993920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 994241a4b83SYohann // Get elemsize, emode, ncomp 995241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 996241a4b83SYohann CeedChk(ierr); 997241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 998241a4b83SYohann CeedChk(ierr); 999241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1000241a4b83SYohann CeedChk(ierr); 10014d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1002241a4b83SYohann CeedChk(ierr); 1003920dcdc4Sjeremylt 1004920dcdc4Sjeremylt // Restriction 1005920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 1006920dcdc4Sjeremylt !((emode == CEED_EVAL_NONE) && basis_data->d_collograd1d)) { 1007241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10089a2291e3SJeremy L Thompson 10099a2291e3SJeremy L Thompson bool isStrided; 1010fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 10119a2291e3SJeremy L Thompson if (!isStrided) { 10125c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 10135c7b696cSjeremylt CeedChk(ierr); 10145c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10155c7b696cSjeremylt CeedInt compstride; 10165c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 10175c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 10189a2291e3SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 10199a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10205c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1021ccedf6b0Sjeremylt } else { 1022920dcdc4Sjeremylt bool backendstrides; 1023fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1024920dcdc4Sjeremylt CeedChk(ierr); 1025920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1026920dcdc4Sjeremylt if (!backendstrides) { 1027920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1028920dcdc4Sjeremylt CeedChk(ierr); 1029ccedf6b0Sjeremylt } 1030920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1031d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1032920dcdc4Sjeremylt } 1033920dcdc4Sjeremylt } 1034920dcdc4Sjeremylt 1035920dcdc4Sjeremylt // Basis action 1036920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1037920dcdc4Sjeremylt switch (emode) { 1038920dcdc4Sjeremylt case CEED_EVAL_NONE: 1039920dcdc4Sjeremylt if (!basis_data->d_collograd1d) { 1040920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1041920dcdc4Sjeremylt } 1042920dcdc4Sjeremylt break; 1043920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1044241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1045241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1046241a4b83SYohann break; 1047241a4b83SYohann case CEED_EVAL_GRAD: 1048ac421f39SYohann if (basis_data->d_collograd1d) { 1049ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1050ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1051ac421f39SYohann } else { 1052241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1053241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1054ac421f39SYohann } 1055241a4b83SYohann break; 1056241a4b83SYohann case CEED_EVAL_WEIGHT: 1057241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1058241a4b83SYohann ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); 1059241a4b83SYohann ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); 1060241a4b83SYohann data->W = basis_data->d_qweight1d; 1061241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1062241a4b83SYohann break; // No action 1063241a4b83SYohann case CEED_EVAL_DIV: 1064241a4b83SYohann break; // TODO: Not implemented 1065241a4b83SYohann case CEED_EVAL_CURL: 1066241a4b83SYohann break; // TODO: Not implemented 1067241a4b83SYohann } 1068241a4b83SYohann } 1069ac421f39SYohann 1070241a4b83SYohann // Q function 1071920dcdc4Sjeremylt code << "\n// Output field setup\n"; 1072241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1073920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1074241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1075241a4b83SYohann CeedChk(ierr); 1076241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1077241a4b83SYohann { 1078ac421f39SYohann if (basis_data->d_collograd1d) { 1079ac421f39SYohann //Accumulator for gradient slices 1080ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1081ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1082ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1083ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1084ac421f39SYohann code << " }\n"; 1085ac421f39SYohann code << " }\n"; 1086ac421f39SYohann } else { 1087241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1088241a4b83SYohann } 1089ac421f39SYohann } 1090241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1091241a4b83SYohann { 1092241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1093241a4b83SYohann } 1094241a4b83SYohann } 1095ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 1096ac421f39SYohann if (basis_data->d_collograd1d) { 1097920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1098ac421f39SYohann code << "#pragma unroll\n"; 1099ac421f39SYohann code << "for (CeedInt q=0; q<Q1d; q++) {\n"; 1100ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1101920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1102ac421f39SYohann // Get elemsize, emode, ncomp 1103ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1104ac421f39SYohann CeedChk(ierr); 1105ac421f39SYohann // Basis action 1106920dcdc4Sjeremylt code << "// EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1107ac421f39SYohann switch (emode) { 1108ac421f39SYohann case CEED_EVAL_NONE: 1109ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11109a2291e3SJeremy L Thompson 11119a2291e3SJeremy L Thompson bool isStrided; 1112fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 11139a2291e3SJeremy L Thompson if (!isStrided) { 11145c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 11155c7b696cSjeremylt CeedChk(ierr); 11165c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11175c7b696cSjeremylt CeedInt compstride; 11185c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 11195c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 11209a2291e3SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 11219a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11225c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1123920dcdc4Sjeremylt } else { 1124920dcdc4Sjeremylt bool backendstrides; 1125fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1126920dcdc4Sjeremylt CeedChk(ierr); 1127920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1128920dcdc4Sjeremylt if (!backendstrides) { 1129920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1130920dcdc4Sjeremylt CeedChk(ierr); 1131920dcdc4Sjeremylt } 1132920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1133d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1134920dcdc4Sjeremylt } 1135ac421f39SYohann break; 1136ac421f39SYohann case CEED_EVAL_INTERP: 1137ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1138ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1139ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1140ac421f39SYohann code << " }\n"; 1141ac421f39SYohann break; 1142ac421f39SYohann case CEED_EVAL_GRAD: 1143ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1144ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1145ac421f39SYohann break; 1146ac421f39SYohann case CEED_EVAL_WEIGHT: 1147ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1148ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1149ac421f39SYohann break; // No action 1150ac421f39SYohann case CEED_EVAL_DIV: 1151ac421f39SYohann break; // TODO: Not implemented 1152ac421f39SYohann case CEED_EVAL_CURL: 1153ac421f39SYohann break; // TODO: Not implemented 1154ac421f39SYohann } 1155ac421f39SYohann } 1156ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1157920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1158ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1159ac421f39SYohann CeedChk(ierr); 1160ac421f39SYohann // Basis action 1161ac421f39SYohann switch (emode) { 1162ac421f39SYohann case CEED_EVAL_NONE: 1163ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1164ac421f39SYohann break; // No action 1165ac421f39SYohann case CEED_EVAL_INTERP: 1166ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1167ac421f39SYohann break; 1168ac421f39SYohann case CEED_EVAL_GRAD: 1169ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1170ac421f39SYohann break; 1171ac421f39SYohann case CEED_EVAL_WEIGHT: 1172ac421f39SYohann break; // Should not occur 1173ac421f39SYohann case CEED_EVAL_DIV: 1174ac421f39SYohann break; // TODO: Not implemented 1175ac421f39SYohann case CEED_EVAL_CURL: 1176ac421f39SYohann break; // TODO: Not implemented 1177ac421f39SYohann } 1178ac421f39SYohann } 1179ac421f39SYohann } else { 1180920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1181ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1182920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1183ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1184ac421f39SYohann } 1185ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1186920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1187ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1188ac421f39SYohann } 1189ac421f39SYohann } 1190920dcdc4Sjeremylt code << " // QFunction Inputs and outputs\n"; 11914d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 11924d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1193920dcdc4Sjeremylt code << " // -- Input field "<<i<<" --\n"; 1194ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 11954d537eeaSYohann } 11964d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 11974d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1198920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1199ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12004d537eeaSYohann } 1201920dcdc4Sjeremylt code << "\n // Apply QFunction\n"; 1202ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 1203ac421f39SYohann if (dim != 3 || basis_data->d_collograd1d) { 1204ac421f39SYohann code << "1 "; 1205ac421f39SYohann } else { 1206ac421f39SYohann code << "Q1d "; 1207ac421f39SYohann } 1208ac421f39SYohann code << ", in, out);\n"; 1209ac421f39SYohann if (basis_data->d_collograd1d) { 1210920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1211ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1212920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1213ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1214ac421f39SYohann CeedChk(ierr); 1215ac421f39SYohann // Basis action 1216920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1217ac421f39SYohann switch (emode) { 1218ac421f39SYohann case CEED_EVAL_NONE: 1219ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1220ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1221ac421f39SYohann code << " }\n"; 1222ac421f39SYohann break; // No action 1223ac421f39SYohann case CEED_EVAL_INTERP: 1224ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1225ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1226ac421f39SYohann code << " }\n"; 1227ac421f39SYohann break; 1228ac421f39SYohann case CEED_EVAL_GRAD: 1229ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1230ac421f39SYohann break; 1231ac421f39SYohann case CEED_EVAL_WEIGHT: 1232ac421f39SYohann break; // Should not occur 1233ac421f39SYohann case CEED_EVAL_DIV: 1234ac421f39SYohann break; // TODO: Not implemented 1235ac421f39SYohann case CEED_EVAL_CURL: 1236ac421f39SYohann break; // TODO: Not implemented 1237ac421f39SYohann } 1238ac421f39SYohann } 1239ac421f39SYohann code << "}\n"; 1240ac421f39SYohann } 1241241a4b83SYohann 1242241a4b83SYohann // Output basis apply if needed 1243ac421f39SYohann // Generate the correct eval mode code for each output 1244920dcdc4Sjeremylt code << "\n// Output field basis action and restrictions\n"; 1245241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1246920dcdc4Sjeremylt code << " // -- Output field "<<i<<" --\n"; 1247241a4b83SYohann // Get elemsize, emode, ncomp 1248241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1249241a4b83SYohann CeedChk(ierr); 1250241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1251241a4b83SYohann CeedChk(ierr); 1252241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1253241a4b83SYohann CeedChk(ierr); 12544d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1255241a4b83SYohann CeedChk(ierr); 1256241a4b83SYohann // Basis action 1257920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1258241a4b83SYohann switch (emode) { 1259241a4b83SYohann case CEED_EVAL_NONE: 1260920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1261241a4b83SYohann break; // No action 1262241a4b83SYohann case CEED_EVAL_INTERP: 1263241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1264241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1265241a4b83SYohann break; 1266241a4b83SYohann case CEED_EVAL_GRAD: 1267241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1268ac421f39SYohann if (basis_data->d_collograd1d) { 1269ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1270ac421f39SYohann } else { 1271241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1272ac421f39SYohann } 1273241a4b83SYohann break; 1274241a4b83SYohann case CEED_EVAL_WEIGHT: { 1275241a4b83SYohann Ceed ceed; 1276241a4b83SYohann ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr); 1277241a4b83SYohann return CeedError(ceed, 1, 1278241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1279241a4b83SYohann break; // Should not occur 1280241a4b83SYohann } 1281241a4b83SYohann case CEED_EVAL_DIV: 1282241a4b83SYohann break; // TODO: Not implemented 1283241a4b83SYohann case CEED_EVAL_CURL: 1284241a4b83SYohann break; // TODO: Not implemented 1285241a4b83SYohann } 1286920dcdc4Sjeremylt // Restriction 12879a2291e3SJeremy L Thompson bool isStrided; 1288fd364f38SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChk(ierr); 12899a2291e3SJeremy L Thompson if (!isStrided) { 12905c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 12915c7b696cSjeremylt CeedChk(ierr); 12925c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 12935c7b696cSjeremylt CeedInt compstride; 12945c7b696cSjeremylt ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChk(ierr); 12955c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 12969a2291e3SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr); 12979a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 12985c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1299920dcdc4Sjeremylt } else { 1300920dcdc4Sjeremylt bool backendstrides; 1301fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1302920dcdc4Sjeremylt CeedChk(ierr); 1303920dcdc4Sjeremylt CeedInt strides[3] = {1, elemsize, elemsize*ncomp}; 1304920dcdc4Sjeremylt if (!backendstrides) { 1305920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1306920dcdc4Sjeremylt CeedChk(ierr); 1307920dcdc4Sjeremylt } 1308920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1309d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1310920dcdc4Sjeremylt } 1311241a4b83SYohann } 1312241a4b83SYohann 1313241a4b83SYohann code << " }\n"; 1314241a4b83SYohann code << "}\n\n"; 1315241a4b83SYohann 1316ab213215SJeremy L Thompson // View kernel for debugging 1317241a4b83SYohann // std::cout << code.str(); 1318241a4b83SYohann 1319920dcdc4Sjeremylt ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); 1320920dcdc4Sjeremylt CeedChk(ierr); 1321*c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1322241a4b83SYohann CeedChk(ierr); 1323241a4b83SYohann 1324241a4b83SYohann ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); 1325241a4b83SYohann return 0; 1326241a4b83SYohann } 1327ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1328