13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 97df94212SJeremy L Thompson 10ec3da8bcSJed Brown #include <ceed/ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 123d576824SJeremy L Thompson #include <cuda_runtime.h> 13241a4b83SYohann #include <iostream> 14241a4b83SYohann #include <sstream> 153d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 166d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 170d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 18241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 19241a4b83SYohann 20f1a13f77SYohann Dudouit static const char *atomicAdd = QUOTE( 21ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 22ab213215SJeremy L Thompson // Atomic add, for older CUDA 23ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2480a9ef05SNatalie Beams __device__ CeedScalar atomicAdd(CeedScalar *address, CeedScalar val) { 25241a4b83SYohann unsigned long long int *address_as_ull = (unsigned long long int *)address; 26241a4b83SYohann unsigned long long int old = *address_as_ull, assumed; 27241a4b83SYohann do { 28241a4b83SYohann assumed = old; 29241a4b83SYohann old = 30241a4b83SYohann atomicCAS(address_as_ull, assumed, 31241a4b83SYohann __double_as_longlong(val + 32241a4b83SYohann __longlong_as_double(assumed))); 33241a4b83SYohann // Note: uses integer comparison to avoid hang in case of NaN 34241a4b83SYohann // (since NaN != NaN) 35241a4b83SYohann } while (assumed != old); 36241a4b83SYohann return __longlong_as_double(old); 37241a4b83SYohann } 38f1a13f77SYohann Dudouit ); 39f1a13f77SYohann Dudouit 40f1a13f77SYohann Dudouit static const char *deviceFunctions = QUOTE( 41f1a13f77SYohann Dudouit 42ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 43ab213215SJeremy L Thompson // Typedefs 44ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 45f1a13f77SYohann Dudouit typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields; 46f1a13f77SYohann Dudouit typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt; 47f1a13f77SYohann Dudouit 48f1a13f77SYohann Dudouit typedef struct { 49f1a13f77SYohann Dudouit CeedInt tidx; 50f1a13f77SYohann Dudouit CeedInt tidy; 51f1a13f77SYohann Dudouit CeedInt tidz; 52f1a13f77SYohann Dudouit CeedInt tid; 53f1a13f77SYohann Dudouit CeedScalar* slice; 54f1a13f77SYohann Dudouit } BackendData; 55241a4b83SYohann 56ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 57ab213215SJeremy L Thompson // Load matrices for basis actions 58ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 59241a4b83SYohann template <int P, int Q> 60053543deSYohann Dudouit inline __device__ void loadMatrix(BackendData &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 61ab213215SJeremy L Thompson for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z) 62241a4b83SYohann B[i] = d_B[i]; 63241a4b83SYohann } 64241a4b83SYohann 65ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 66241a4b83SYohann // 1D 67ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 68ab213215SJeremy L Thompson 69ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 70ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 71ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 725c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 73053543deSYohann Dudouit inline __device__ void readDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 74ab213215SJeremy L Thompson if (data.tidx < P1d) { 754d537eeaSYohann const CeedInt node = data.tidx; 76ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 77ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 785c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 79241a4b83SYohann } 80241a4b83SYohann } 81920dcdc4Sjeremylt 82ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 83ab213215SJeremy L Thompson // L-vector -> E-vector, strided 84ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 85d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 86053543deSYohann Dudouit inline __device__ void readDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 87ab213215SJeremy L Thompson if (data.tidx < P1d) { 88ccedf6b0Sjeremylt const CeedInt node = data.tidx; 89d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 90ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 91d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 92ccedf6b0Sjeremylt } 93ccedf6b0Sjeremylt } 94241a4b83SYohann 95ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 96ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 97ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 985c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 99053543deSYohann Dudouit inline __device__ void writeDofsOffset1d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 100ab213215SJeremy L Thompson if (data.tidx < P1d) { 1014d537eeaSYohann const CeedInt node = data.tidx; 102ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d]; 103ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1045c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 105241a4b83SYohann } 106241a4b83SYohann } 107241a4b83SYohann 108ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 109ab213215SJeremy L Thompson // E-vector -> L-vector, strided 110ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 111d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 112d80fc06aSjeremylt inline __device__ void writeDofsStrided1d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 113ab213215SJeremy L Thompson if (data.tidx < P1d) { 114ccedf6b0Sjeremylt const CeedInt node = data.tidx; 115d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 116ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 117d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 118ccedf6b0Sjeremylt } 119ccedf6b0Sjeremylt } 120ccedf6b0Sjeremylt 121ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 122ab213215SJeremy L Thompson // 1D tensor contraction x 123ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 124241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 125ab213215SJeremy L Thompson inline __device__ void ContractX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 126241a4b83SYohann data.slice[data.tidx] = *U; 127241a4b83SYohann __syncthreads(); 128241a4b83SYohann *V = 0.0; 12918d499f1SYohann if (data.tidx < Q1d) 130ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 131ab213215SJeremy L Thompson *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction 132241a4b83SYohann __syncthreads(); 133241a4b83SYohann } 134241a4b83SYohann 135ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 136ab213215SJeremy L Thompson // 1D transpose tensor contraction x 137ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 138241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 139ab213215SJeremy L Thompson inline __device__ void ContractTransposeX1d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140241a4b83SYohann data.slice[data.tidx] = *U; 141241a4b83SYohann __syncthreads(); 142241a4b83SYohann *V = 0.0; 14318d499f1SYohann if (data.tidx < P1d) 144ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 145ab213215SJeremy L Thompson *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction 146241a4b83SYohann __syncthreads(); 147241a4b83SYohann } 148241a4b83SYohann 149ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 150ab213215SJeremy L Thompson // 1D interpolate to quadrature points 151ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 152241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 153ab213215SJeremy L Thompson inline __device__ void interp1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 154ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 155241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 156241a4b83SYohann } 157241a4b83SYohann 158ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 159ab213215SJeremy L Thompson // 1D interpolate transpose 160ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 161241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 162ab213215SJeremy L Thompson inline __device__ void interpTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 163ab213215SJeremy L Thompson for (CeedInt comp=0; comp<NCOMP; comp++) 164241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp); 165241a4b83SYohann } 166241a4b83SYohann 167ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 168ab213215SJeremy L Thompson // 1D derivatives at quadrature points 169ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 170241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 171ab213215SJeremy L Thompson inline __device__ void grad1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 172ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 173241a4b83SYohann ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 174241a4b83SYohann } 175241a4b83SYohann 176ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 177ab213215SJeremy L Thompson // 1D derivatives transpose 178ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 179241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 180ab213215SJeremy L Thompson inline __device__ void gradTranspose1d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 181ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; comp++) 182241a4b83SYohann ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp); 183241a4b83SYohann } 184241a4b83SYohann 185ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 186241a4b83SYohann // 2D 187ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 188ab213215SJeremy L Thompson 189ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 190ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 191ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1925c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 193053543deSYohann Dudouit inline __device__ void readDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 194ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 1954d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 196ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 197ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 1985c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 199241a4b83SYohann } 200241a4b83SYohann } 201920dcdc4Sjeremylt 202ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 203ab213215SJeremy L Thompson // L-vector -> E-vector, strided 204ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 205d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 206053543deSYohann Dudouit inline __device__ void readDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 207ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 208ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 209d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 210ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 211d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 212ccedf6b0Sjeremylt } 213ccedf6b0Sjeremylt } 214241a4b83SYohann 215ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 216ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 217ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 2185c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 219053543deSYohann Dudouit inline __device__ void writeDofsOffset2d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 220ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 2214d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d; 222ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d]; 223ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 2245c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); 225241a4b83SYohann } 226241a4b83SYohann } 227241a4b83SYohann 228ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 229ab213215SJeremy L Thompson // E-vector -> L-vector, strided 230ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 231d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 232d80fc06aSjeremylt inline __device__ void writeDofsStrided2d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 233ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) { 234ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d; 235d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 236ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 237d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 238ccedf6b0Sjeremylt } 239ccedf6b0Sjeremylt } 240ccedf6b0Sjeremylt 241ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 242ab213215SJeremy L Thompson // 2D tensor contraction x 243ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 244241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 245ab213215SJeremy L Thompson inline __device__ void ContractX2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 24618d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 247241a4b83SYohann __syncthreads(); 248241a4b83SYohann *V = 0.0; 24918d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 250ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 25118d499f1SYohann *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 252241a4b83SYohann __syncthreads(); 253241a4b83SYohann } 254241a4b83SYohann 255ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 256ab213215SJeremy L Thompson // 2D tensor contract y 257ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 258241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 259ab213215SJeremy L Thompson inline __device__ void ContractY2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 26018d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 261241a4b83SYohann __syncthreads(); 262241a4b83SYohann *V = 0.0; 26318d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 264ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 26518d499f1SYohann *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 266241a4b83SYohann __syncthreads(); 267241a4b83SYohann } 268241a4b83SYohann 269ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 270ab213215SJeremy L Thompson // 2D transpose tensor contract y 271ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 272241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 273ab213215SJeremy L Thompson inline __device__ void ContractYTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 27418d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 275241a4b83SYohann __syncthreads(); 276241a4b83SYohann *V = 0.0; 27718d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 278ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 27918d499f1SYohann *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*T1d]; // Contract y direction 280241a4b83SYohann __syncthreads(); 281241a4b83SYohann } 282241a4b83SYohann 283ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 284ab213215SJeremy L Thompson // 2D transpose tensor contract x 285ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 286241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 287ab213215SJeremy L Thompson inline __device__ void ContractXTranspose2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 28818d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 289241a4b83SYohann __syncthreads(); 290241a4b83SYohann *V = 0.0; 29118d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 292ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 29318d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 294241a4b83SYohann __syncthreads(); 295241a4b83SYohann } 296241a4b83SYohann 297ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 298ab213215SJeremy L Thompson // 2D transpose tensor contract and add x 299ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 300241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 301ab213215SJeremy L Thompson inline __device__ void ContractXTransposeAdd2d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 30218d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = *U; 303241a4b83SYohann __syncthreads(); 30418d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 305ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 30618d499f1SYohann *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*T1d]; // Contract x direction 307241a4b83SYohann __syncthreads(); 308241a4b83SYohann } 309241a4b83SYohann 310ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 311ab213215SJeremy L Thompson // 2D interpolate to quadrature points 312ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 313241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 314ab213215SJeremy L Thompson inline __device__ void interp2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 315241a4b83SYohann CeedScalar r_t[1]; 316ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 317241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 318241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 319241a4b83SYohann } 320241a4b83SYohann } 321241a4b83SYohann 322ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 323ab213215SJeremy L Thompson // 2D interpolate transpose 324ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 325241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 326ab213215SJeremy L Thompson inline __device__ void interpTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 327241a4b83SYohann CeedScalar r_t[1]; 328ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 329241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 330241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 331241a4b83SYohann } 332241a4b83SYohann } 333241a4b83SYohann 334ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 335ab213215SJeremy L Thompson // 2D derivatives at quadrature points 336ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 337241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 338ab213215SJeremy L Thompson inline __device__ void grad2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 339241a4b83SYohann CeedScalar r_t[1]; 340ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 341241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t); 342241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP); 343241a4b83SYohann ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t); 344241a4b83SYohann ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP); 345241a4b83SYohann } 346241a4b83SYohann } 347241a4b83SYohann 348ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 349ab213215SJeremy L Thompson // 2D derivatives transpose 350ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 351241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 352ab213215SJeremy L Thompson inline __device__ void gradTranspose2d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 353241a4b83SYohann CeedScalar r_t[1]; 354ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 355241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t); 356241a4b83SYohann ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp); 357241a4b83SYohann ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t); 358241a4b83SYohann ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp); 359241a4b83SYohann } 360241a4b83SYohann } 361241a4b83SYohann 362ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 363241a4b83SYohann // 3D 364ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 365ab213215SJeremy L Thompson 366ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 367ab213215SJeremy L Thompson // L-vector -> E-vector, offsets provided 368ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3695c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 370053543deSYohann Dudouit inline __device__ void readDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 371ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 372241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 3734d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 374ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 375ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 3765c7b696cSjeremylt r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp]; 377241a4b83SYohann } 378241a4b83SYohann } 379920dcdc4Sjeremylt 380ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 381ab213215SJeremy L Thompson // L-vector -> E-vector, strided 382ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 383d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 384053543deSYohann Dudouit inline __device__ void readDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 385ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 386ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 387ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 388d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 389ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 390d80fc06aSjeremylt r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP]; 391ccedf6b0Sjeremylt } 392ccedf6b0Sjeremylt } 393241a4b83SYohann 394ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 395ab213215SJeremy L Thompson // E-vector -> Q-vector, offests provided 396ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 3975c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int Q1d> 398053543deSYohann Dudouit inline __device__ void readSliceQuadsOffset3d(BackendData &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 39918d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 400ac421f39SYohann const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 401920dcdc4Sjeremylt const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];; 402ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4035c7b696cSjeremylt r_u[comp] = d_u[ind + COMPSTRIDE * comp]; 404ac421f39SYohann } 40518d499f1SYohann } 406ac421f39SYohann 407ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 408ab213215SJeremy L Thompson // E-vector -> Q-vector, strided 409ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 410d80fc06aSjeremylt template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 411053543deSYohann Dudouit inline __device__ void readSliceQuadsStrided3d(BackendData &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 41218d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 413920dcdc4Sjeremylt const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d; 414d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 415ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 416d80fc06aSjeremylt r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 417920dcdc4Sjeremylt } 41818d499f1SYohann } 419920dcdc4Sjeremylt 420ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 421ab213215SJeremy L Thompson // E-vector -> L-vector, offsets provided 422ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 4235c7b696cSjeremylt template <int NCOMP, int COMPSTRIDE, int P1d> 424053543deSYohann Dudouit inline __device__ void writeDofsOffset3d(BackendData &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { 425ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 426241a4b83SYohann for (CeedInt z = 0; z < P1d; ++z) { 4274d537eeaSYohann const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 428ccedf6b0Sjeremylt const CeedInt ind = indices[node + elem * P1d*P1d*P1d]; 429ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 4305c7b696cSjeremylt atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]); 431241a4b83SYohann } 432241a4b83SYohann } 433241a4b83SYohann 434ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 435ab213215SJeremy L Thompson // E-vector -> L-vector, strided 436ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 437d80fc06aSjeremylt template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 4382f4ca718Sjeremylt inline __device__ void writeDofsStrided3d(BackendData &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { 439ab213215SJeremy L Thompson if (data.tidx < P1d && data.tidy < P1d) 440ccedf6b0Sjeremylt for (CeedInt z = 0; z < P1d; ++z) { 441ccedf6b0Sjeremylt const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d; 442d80fc06aSjeremylt const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 443ab213215SJeremy L Thompson for (CeedInt comp = 0; comp < NCOMP; ++comp) 444d80fc06aSjeremylt d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d]; 445ccedf6b0Sjeremylt } 446ccedf6b0Sjeremylt } 447ccedf6b0Sjeremylt 448ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 449ab213215SJeremy L Thompson // 3D tensor contract x 450ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 451241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 452ab213215SJeremy L Thompson inline __device__ void ContractX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 453ac421f39SYohann CeedScalar r_B[P1d]; 454ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 455ac421f39SYohann r_B[i] = B[i + data.tidx*P1d]; 456ab213215SJeremy L Thompson 457ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 45818d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 459241a4b83SYohann __syncthreads(); 460241a4b83SYohann V[k] = 0.0; 46118d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 462ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 46318d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 464241a4b83SYohann __syncthreads(); 465241a4b83SYohann } 466241a4b83SYohann } 467241a4b83SYohann 468ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 469ab213215SJeremy L Thompson // 3D tensor contract y 470ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 471241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 472ab213215SJeremy L Thompson inline __device__ void ContractY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 473ac421f39SYohann CeedScalar r_B[P1d]; 474ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 475ac421f39SYohann r_B[i] = B[i + data.tidy*P1d]; 476ab213215SJeremy L Thompson 477ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 47818d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 479241a4b83SYohann __syncthreads(); 480241a4b83SYohann V[k] = 0.0; 48118d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 482ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 48318d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 484241a4b83SYohann __syncthreads(); 485241a4b83SYohann } 486241a4b83SYohann } 487241a4b83SYohann 488ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 489ab213215SJeremy L Thompson // 3D tensor contract z 490ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 491241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 492ab213215SJeremy L Thompson inline __device__ void ContractZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 493ac421f39SYohann for (CeedInt k = 0; k < Q1d; ++k) { 494241a4b83SYohann V[k] = 0.0; 49518d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 496ab213215SJeremy L Thompson for (CeedInt i = 0; i < P1d; ++i) 497ab213215SJeremy L Thompson V[k] += B[i + k*P1d] * U[i]; // Contract z direction 498241a4b83SYohann } 499241a4b83SYohann } 500241a4b83SYohann 501ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 502ab213215SJeremy L Thompson // 3D transpose tensor contract z 503ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 504241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 505ab213215SJeremy L Thompson inline __device__ void ContractTransposeZ3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 50618d499f1SYohann for (CeedInt k = 0; k < P1d; ++k) { 507241a4b83SYohann V[k] = 0.0; 50818d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) 509ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 510ab213215SJeremy L Thompson V[k] += B[k + i*P1d] * U[i]; // Contract z direction 511241a4b83SYohann } 512241a4b83SYohann } 513241a4b83SYohann 514ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 515ab213215SJeremy L Thompson // 3D transpose tensor contract y 516ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 517241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 518ab213215SJeremy L Thompson inline __device__ void ContractTransposeY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 519ac421f39SYohann CeedScalar r_B[Q1d]; 520ac421f39SYohann for (CeedInt i = 0; i < Q1d; ++i) 521ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 522ab213215SJeremy L Thompson 523ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 52418d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 525241a4b83SYohann __syncthreads(); 526241a4b83SYohann V[k] = 0.0; 52718d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 528ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 52918d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 530ac421f39SYohann __syncthreads(); 531ac421f39SYohann } 532ac421f39SYohann } 533ac421f39SYohann 534ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 535ab213215SJeremy L Thompson // 3D transpose tensor contract add y 536ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 537ac421f39SYohann template <int NCOMP, int P1d, int Q1d> 538ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddY3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 539ac421f39SYohann CeedScalar r_B[Q1d]; 540ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 541ac421f39SYohann r_B[i] = B[data.tidy + i*P1d]; 542ab213215SJeremy L Thompson 543ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 54418d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 545ac421f39SYohann __syncthreads(); 54618d499f1SYohann if (data.tidx < Q1d && data.tidy < P1d) 547ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 54818d499f1SYohann V[k] += r_B[i] * data.slice[data.tidx + i*T1d]; // Contract y direction 549241a4b83SYohann __syncthreads(); 550241a4b83SYohann } 551241a4b83SYohann } 552241a4b83SYohann 553ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 554ab213215SJeremy L Thompson // 3D transpose tensor contract x 555ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 556241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 557ab213215SJeremy L Thompson inline __device__ void ContractTransposeX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 558ac421f39SYohann CeedScalar r_B[Q1d]; 559ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 560ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 561ab213215SJeremy L Thompson 562ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 56318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 564241a4b83SYohann __syncthreads(); 565241a4b83SYohann V[k] = 0.0; 56618d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 567ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 56818d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 569241a4b83SYohann __syncthreads(); 570241a4b83SYohann } 571241a4b83SYohann } 572241a4b83SYohann 573ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 574ab213215SJeremy L Thompson // 3D transpose tensor contract add x 575ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 576241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 577ab213215SJeremy L Thompson inline __device__ void ContractTransposeAddX3d(BackendData &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 578ac421f39SYohann CeedScalar r_B[Q1d]; 579ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 580ac421f39SYohann r_B[i] = B[data.tidx + i*P1d]; 581ab213215SJeremy L Thompson 582ac421f39SYohann for (CeedInt k = 0; k < P1d; ++k) { 58318d499f1SYohann data.slice[data.tidx+data.tidy*T1d] = U[k]; 584241a4b83SYohann __syncthreads(); 58518d499f1SYohann if (data.tidx < P1d && data.tidy < P1d) 586ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 58718d499f1SYohann V[k] += r_B[i] * data.slice[i + data.tidy*T1d]; // Contract x direction 588241a4b83SYohann __syncthreads(); 589241a4b83SYohann } 590241a4b83SYohann } 591241a4b83SYohann 592ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 593ab213215SJeremy L Thompson // 3D interpolate to quadrature points 594ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 595241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 596ab213215SJeremy L Thompson inline __device__ void interp3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 59718d499f1SYohann CeedScalar r_t1[T1d]; 59818d499f1SYohann CeedScalar r_t2[T1d]; 599ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 600241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 601241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 602241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d); 603241a4b83SYohann } 604241a4b83SYohann } 605241a4b83SYohann 606ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 607ab213215SJeremy L Thompson // 3D interpolate transpose 608ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 609241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 610ab213215SJeremy L Thompson inline __device__ void interpTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 61118d499f1SYohann CeedScalar r_t1[T1d]; 61218d499f1SYohann CeedScalar r_t2[T1d]; 613ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 614241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1); 615241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 616241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 617241a4b83SYohann } 618241a4b83SYohann } 619241a4b83SYohann 620ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 621ab213215SJeremy L Thompson // 3D derivatives at quadrature points 622ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 623241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 624ab213215SJeremy L Thompson inline __device__ void grad3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 62518d499f1SYohann CeedScalar r_t1[T1d]; 62618d499f1SYohann CeedScalar r_t2[T1d]; 627ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 628241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1); 629241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 630241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d); 631241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 632241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 633241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d); 634241a4b83SYohann ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1); 635241a4b83SYohann ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 636241a4b83SYohann ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d); 637241a4b83SYohann } 638241a4b83SYohann } 639241a4b83SYohann 640ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 641ab213215SJeremy L Thompson // 3D derivatives transpose 642ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 643241a4b83SYohann template <int NCOMP, int P1d, int Q1d> 644ab213215SJeremy L Thompson inline __device__ void gradTranspose3d(BackendData &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 64518d499f1SYohann CeedScalar r_t1[T1d]; 64618d499f1SYohann CeedScalar r_t2[T1d]; 647ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; comp++) { 648241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1); 649241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 650241a4b83SYohann ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d); 651241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1); 652241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2); 653241a4b83SYohann ContractTransposeAddX3d<NCOMP,P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 654241a4b83SYohann ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1); 655241a4b83SYohann ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2); 656241a4b83SYohann ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d); 657241a4b83SYohann } 658241a4b83SYohann } 659241a4b83SYohann 660ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 661ab213215SJeremy L Thompson // 3D collocated derivatives computation 662ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 663ac421f39SYohann template <int NCOMP, int Q1d> 664ab213215SJeremy L Thompson inline __device__ void gradCollo3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 66518d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 666ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 66718d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[q + comp*Q1d]; 668ac421f39SYohann __syncthreads(); 669ac421f39SYohann // X derivative 670ac421f39SYohann r_V[comp+0*NCOMP] = 0.0; 671ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 67218d499f1SYohann r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 673ac421f39SYohann // Y derivative 674ac421f39SYohann r_V[comp+1*NCOMP] = 0.0; 675ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 67618d499f1SYohann r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 677ac421f39SYohann // Z derivative 678ac421f39SYohann r_V[comp+2*NCOMP] = 0.0; 679ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 680ab213215SJeremy L Thompson r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative) 681ac421f39SYohann __syncthreads(); 682ac421f39SYohann } 683ac421f39SYohann } 68418d499f1SYohann } 685ac421f39SYohann 686ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 687ab213215SJeremy L Thompson // 3D collocated derivatives transpose 688ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 689ac421f39SYohann template <int NCOMP, int Q1d> 690ab213215SJeremy L Thompson inline __device__ void gradColloTranspose3d(BackendData &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 69118d499f1SYohann if (data.tidx < Q1d && data.tidy < Q1d) { 692ac421f39SYohann for (CeedInt comp = 0; comp < NCOMP; ++comp) { 693ac421f39SYohann // X derivative 69418d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 0*NCOMP]; 695ac421f39SYohann __syncthreads(); 696ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 69718d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*T1d]; // Contract x direction (X derivative) 698ac421f39SYohann __syncthreads(); 699ac421f39SYohann // Y derivative 70018d499f1SYohann data.slice[data.tidx + data.tidy*T1d] = r_U[comp + 1*NCOMP]; 701ac421f39SYohann __syncthreads(); 702ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 70318d499f1SYohann r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*T1d]; // Contract y direction (Y derivative) 704ac421f39SYohann __syncthreads(); 705ac421f39SYohann // Z derivative 706ab213215SJeremy L Thompson for (CeedInt i = 0; i < Q1d; ++i) 707ac421f39SYohann r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative) 708ac421f39SYohann } 709ac421f39SYohann } 71018d499f1SYohann } 711ac421f39SYohann 712ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 713ab213215SJeremy L Thompson // 1D quadrature weights 714ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 715241a4b83SYohann template <int Q1d> 716053543deSYohann Dudouit inline __device__ void weight1d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 71718d499f1SYohann *w = (data.tidx < Q1d) ? qweight1d[data.tidx] : 0.0; 718241a4b83SYohann } 719241a4b83SYohann 720ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 721ab213215SJeremy L Thompson // 2D quadrature weights 722ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 723241a4b83SYohann template <int Q1d> 724053543deSYohann Dudouit inline __device__ void weight2d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 72518d499f1SYohann *w = (data.tidx < Q1d && data.tidy < Q1d) ? 72618d499f1SYohann qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 727241a4b83SYohann } 728241a4b83SYohann 729ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 730ab213215SJeremy L Thompson // 3D quadrature weights 731ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 732241a4b83SYohann template <int Q1d> 733053543deSYohann Dudouit inline __device__ void weight3d(BackendData &data, const CeedScalar *__restrict__ qweight1d, CeedScalar *w) { 73418d499f1SYohann const bool quad = (data.tidx < Q1d && data.tidy < Q1d); 73518d499f1SYohann const CeedScalar pw = quad ? qweight1d[data.tidx]*qweight1d[data.tidy] : 0.0; 736ac421f39SYohann for (CeedInt z = 0; z < Q1d; ++z) 73718d499f1SYohann w[z] = quad ? pw*qweight1d[z] : 0.0; 738241a4b83SYohann } 739241a4b83SYohann 740241a4b83SYohann ); 741ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 742ab213215SJeremy L Thompson // Build singe operator kernel 743ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 744241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 745241a4b83SYohann 746241a4b83SYohann using std::ostringstream; 747241a4b83SYohann using std::string; 748241a4b83SYohann int ierr; 749241a4b83SYohann bool setupdone; 750e15f9bd0SJeremy L Thompson ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChkBackend(ierr); 751e15f9bd0SJeremy L Thompson if (setupdone) return CEED_ERROR_SUCCESS; 752241a4b83SYohann Ceed ceed; 753e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 754241a4b83SYohann CeedOperator_Cuda_gen *data; 755e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 756241a4b83SYohann CeedQFunction qf; 757241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 758e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 759e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 760e79b91d9SJeremy L Thompson CeedSize lsize; 76188db6d3bSJeremy L Thompson CeedInt Q, P1d = 0, Q1d = 0, numelements, elemsize, numinputfields, 7621d47fde2SJeremy L Thompson numoutputfields, ncomp, dim = 1; 763e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 7641d47fde2SJeremy L Thompson Q1d = Q; 765e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumElements(op, &numelements); CeedChkBackend(ierr); 766241a4b83SYohann CeedOperatorField *opinputfields, *opoutputfields; 7677e7773b5SJeremy L Thompson ierr = CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields); 768e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 769241a4b83SYohann CeedQFunctionField *qfinputfields, *qfoutputfields; 7707e7773b5SJeremy L Thompson ierr = CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields); 771e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 772241a4b83SYohann CeedEvalMode emode; 773241a4b83SYohann CeedBasis basis; 774241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 775241a4b83SYohann CeedElemRestriction Erestrict; 776461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 777241a4b83SYohann 7780b454692Sjeremylt // Check for restriction only identity operator 7790b454692Sjeremylt bool is_identity_qf; 7800b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 7810b454692Sjeremylt if (is_identity_qf) { 7820b454692Sjeremylt CeedEvalMode emodein, emodeout; 7830b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[0], &emodein); CeedChkBackend(ierr); 7840b454692Sjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[0], &emodeout); CeedChkBackend(ierr); 7850b454692Sjeremylt if (emodein == CEED_EVAL_NONE && emodeout == CEED_EVAL_NONE) 7860b454692Sjeremylt // LCOV_EXCL_START 7870b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 7880b454692Sjeremylt "Backend does not implement restriction only identity operators"); 7890b454692Sjeremylt // LCOV_EXCL_STOP 7900b454692Sjeremylt } 7910b454692Sjeremylt 792241a4b83SYohann ostringstream code; 793241a4b83SYohann string devFunctions(deviceFunctions); 794241a4b83SYohann 795f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 796f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 797f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 79888db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 7990d0321e0SJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); 80080a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 801f1a13f77SYohann Dudouit code << atomicAdd; 802f1a13f77SYohann Dudouit } 803f1a13f77SYohann Dudouit 804241a4b83SYohann code << devFunctions; 805241a4b83SYohann 806241a4b83SYohann string qFunction(qf_data->qFunctionSource); 807c3c443faSYohann Dudouit string qFunctionName(qf_data->qFunctionName); 808c3c443faSYohann Dudouit string oper; 80914cce66cSYohann Dudouit oper = "CeedKernel_Cuda_gen_" + qFunctionName; 8104d537eeaSYohann 8111da99368SJeremy L Thompson // Find dim and Q1d 812*dc64899eSYohann bool useCollograd = false; 813*dc64899eSYohann // Only use collocated gradient algorithm when we actually compute a gradient. 814*dc64899eSYohann if ( dim == 3 ) { 815*dc64899eSYohann for (CeedInt i = 0; i < numinputfields; i++) { 816*dc64899eSYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 817*dc64899eSYohann if (emode == CEED_EVAL_GRAD) { 818*dc64899eSYohann useCollograd = true; 819*dc64899eSYohann } 820*dc64899eSYohann } 821*dc64899eSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 822*dc64899eSYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 823*dc64899eSYohann if (emode == CEED_EVAL_GRAD) { 824*dc64899eSYohann useCollograd = true; 825*dc64899eSYohann } 826*dc64899eSYohann } 827*dc64899eSYohann } 82818d499f1SYohann data->maxP1d = 0; 8291da99368SJeremy L Thompson for (CeedInt i = 0; i < numinputfields; i++) { 830e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 8311da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 832e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8330f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 834e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8351da99368SJeremy L Thompson 8361da99368SJeremy L Thompson // Check for collocated gradient 837437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8381da99368SJeremy L Thompson 8391da99368SJeremy L Thompson // Collect dim and Q1d 840e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8411da99368SJeremy L Thompson bool isTensor; 842e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8431da99368SJeremy L Thompson if (isTensor) { 844e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 845e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 84618d499f1SYohann if (P1d>data->maxP1d) data->maxP1d = P1d; 8471da99368SJeremy L Thompson } else { 848e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 849e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 850e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8511da99368SJeremy L Thompson } 8521da99368SJeremy L Thompson } 8531da99368SJeremy L Thompson } 8541da99368SJeremy L Thompson // Check output bases for Q1d, dim as well 855*dc64899eSYohann // The only input basis might be CEED_BASIS_COLLOCATED 8561da99368SJeremy L Thompson for (CeedInt i = 0; i < numoutputfields; i++) { 857e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 8580f54b25eSjeremylt 8591da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 860e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 8610f54b25eSjeremylt ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 862e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8630f54b25eSjeremylt 8641da99368SJeremy L Thompson // Collect dim and Q1d 865e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 8661da99368SJeremy L Thompson bool isTensor; 867e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 8681da99368SJeremy L Thompson if (isTensor) { 869e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 8701da99368SJeremy L Thompson } else { 871e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 872e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 873e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 8741da99368SJeremy L Thompson } 8750f54b25eSjeremylt 8760f54b25eSjeremylt // Check for collocated gradient 877437930d1SJeremy L Thompson useCollograd = useCollograd && basis_data->d_collo_grad_1d; 8781da99368SJeremy L Thompson } 8791da99368SJeremy L Thompson } 8801da99368SJeremy L Thompson data->dim = dim; 8811da99368SJeremy L Thompson data->Q1d = Q1d; 8821da99368SJeremy L Thompson 8831da99368SJeremy L Thompson // Define CEED_Q_VLA 88464d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 8851da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA 1\n\n"; 8861da99368SJeremy L Thompson } else { 8871da99368SJeremy L Thompson code << "\n#define CEED_Q_VLA "<<Q1d<<"\n\n"; 8881da99368SJeremy L Thompson } 8891da99368SJeremy L Thompson 890241a4b83SYohann code << qFunction; 891241a4b83SYohann 892241a4b83SYohann // Setup 893818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 894c3c443faSYohann Dudouit code << "\nextern \"C\" __global__ void "<<oper<<"(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n"; 895241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 896241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 897e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 8981da99368SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 899241a4b83SYohann code << " const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n"; 900241a4b83SYohann } 901241a4b83SYohann } 902241a4b83SYohann 903241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 904241a4b83SYohann code << " CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n"; 905241a4b83SYohann } 9061da99368SJeremy L Thompson 907241a4b83SYohann code << " const CeedInt Dim = "<<dim<<";\n"; 908241a4b83SYohann code << " const CeedInt Q1d = "<<Q1d<<";\n"; 9091da99368SJeremy L Thompson 910241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 911241a4b83SYohann code << " BackendData data;\n"; 912241a4b83SYohann code << " data.tidx = threadIdx.x;\n"; 913241a4b83SYohann code << " data.tidy = threadIdx.y;\n"; 914241a4b83SYohann code << " data.tidz = threadIdx.z;\n"; 915241a4b83SYohann code << " data.tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 91618d499f1SYohann code << " data.slice = slice+data.tidz*T1d"<<(dim>1?"*T1d":"")<<";\n"; 917920dcdc4Sjeremylt 918818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 919ac421f39SYohann //Initialize constants, and matrices B and G 920241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 921818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 922241a4b83SYohann // Get elemsize, emode, ncomp 923241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 924e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 925241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 926e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 927241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 928e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9294d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 930e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 931920dcdc4Sjeremylt 932920dcdc4Sjeremylt // Set field constants 933920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT) { 934e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 935920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 936e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 937920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<P1d<<";\n"; 938920dcdc4Sjeremylt } else { 939920dcdc4Sjeremylt code << " const CeedInt P_in_"<<i<<" = "<<Q1d<<";\n"; 940920dcdc4Sjeremylt } 941920dcdc4Sjeremylt code << " const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n"; 942920dcdc4Sjeremylt } 943920dcdc4Sjeremylt 944920dcdc4Sjeremylt // Load basis data 945920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 946241a4b83SYohann switch (emode) { 947241a4b83SYohann case CEED_EVAL_NONE: 948241a4b83SYohann break; 949241a4b83SYohann case CEED_EVAL_INTERP: 950e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 951437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 953241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 954241a4b83SYohann break; 955241a4b83SYohann case CEED_EVAL_GRAD: 956e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 957437930d1SJeremy L Thompson data->B.in[i] = basis_data->d_interp_1d; 95880a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 959241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n"; 96064d3f0c0Sjeremylt if (useCollograd) { 961437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_collo_grad_1d; 96280a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q1d*Q1d<<"];\n"; 963ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 964ac421f39SYohann } else { 965437930d1SJeremy L Thompson data->G.in[i] = basis_data->d_grad_1d; 96680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n"; 967241a4b83SYohann code << " loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n"; 968ac421f39SYohann } 969241a4b83SYohann break; 970241a4b83SYohann case CEED_EVAL_WEIGHT: 971241a4b83SYohann break; // No action 972241a4b83SYohann case CEED_EVAL_DIV: 973241a4b83SYohann break; // TODO: Not implemented 974241a4b83SYohann case CEED_EVAL_CURL: 975241a4b83SYohann break; // TODO: Not implemented 976241a4b83SYohann } 977241a4b83SYohann } 978920dcdc4Sjeremylt 979818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 980241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 981818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 982241a4b83SYohann // Get elemsize, emode, ncomp 983241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 984e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 985241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 986e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 987241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 988e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 9894d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 990e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 991920dcdc4Sjeremylt 992920dcdc4Sjeremylt // Set field constants 993e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChkBackend(ierr); 994920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 995e15f9bd0SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChkBackend(ierr); 996241a4b83SYohann code << " const CeedInt P_out_"<<i<<" = "<<P1d<<";\n"; 997920dcdc4Sjeremylt } else { 998920dcdc4Sjeremylt code << " const CeedInt P_out_"<<i<<" = "<<Q1d<<";\n"; 999920dcdc4Sjeremylt } 1000920dcdc4Sjeremylt code << " const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n"; 1001920dcdc4Sjeremylt 1002920dcdc4Sjeremylt // Load basis data 1003920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1004920dcdc4Sjeremylt switch (emode) { 1005920dcdc4Sjeremylt case CEED_EVAL_NONE: 1006920dcdc4Sjeremylt break; // No action 1007920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1008e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1009437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 101080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1011241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 1012241a4b83SYohann break; 1013241a4b83SYohann case CEED_EVAL_GRAD: 1014e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1015437930d1SJeremy L Thompson data->B.out[i] = basis_data->d_interp_1d; 101680a9ef05SNatalie Beams code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1017241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n"; 101864d3f0c0Sjeremylt if (useCollograd) { 1019437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_collo_grad_1d; 102080a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q1d*Q1d<<"];\n"; 1021ac421f39SYohann code << " loadMatrix<Q1d,Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1022ac421f39SYohann } else { 1023437930d1SJeremy L Thompson data->G.out[i] = basis_data->d_grad_1d; 102480a9ef05SNatalie Beams code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n"; 1025241a4b83SYohann code << " loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n"; 1026ac421f39SYohann } 1027241a4b83SYohann break; 1028e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1029241a4b83SYohann case CEED_EVAL_WEIGHT: { 1030241a4b83SYohann Ceed ceed; 1031e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1032e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1033241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1034241a4b83SYohann break; // Should not occur 1035241a4b83SYohann } 1036241a4b83SYohann case CEED_EVAL_DIV: 1037241a4b83SYohann break; // TODO: Not implemented 1038241a4b83SYohann case CEED_EVAL_CURL: 1039241a4b83SYohann break; // TODO: Not implemented 1040e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1041241a4b83SYohann } 1042241a4b83SYohann } 1043818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 1044ac421f39SYohann code << " __syncthreads();\n"; 1045241a4b83SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n"; 1046241a4b83SYohann // Input basis apply if needed 1047ac421f39SYohann // Generate the correct eval mode code for each input 1048818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 1049241a4b83SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1050818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1051241a4b83SYohann // Get elemsize, emode, ncomp 1052241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); 1053e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1054241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1055e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1056241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1057e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10584d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1059e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1060920dcdc4Sjeremylt 1061920dcdc4Sjeremylt // Restriction 1062920dcdc4Sjeremylt if (emode != CEED_EVAL_WEIGHT && 106364d3f0c0Sjeremylt !((emode == CEED_EVAL_NONE) && useCollograd)) { 1064241a4b83SYohann code << " CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n"; 10659a2291e3SJeremy L Thompson 10669a2291e3SJeremy L Thompson bool isStrided; 1067e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 10689a2291e3SJeremy L Thompson if (!isStrided) { 10695c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1070e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 10715c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 10725c7b696cSjeremylt CeedInt compstride; 1073e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 10745c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1075e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 10769a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 10775c7b696cSjeremylt code << " readDofsOffset"<<dim<<"d<ncomp_in_"<<i<<", "<<compstride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n"; 1078ccedf6b0Sjeremylt } else { 1079920dcdc4Sjeremylt bool backendstrides; 1080fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1081e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108213585805SJeremy L Thompson CeedInt nelem; 108313585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1084e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 108513585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1086920dcdc4Sjeremylt if (!backendstrides) { 1087920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1088e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1089ccedf6b0Sjeremylt } 1090920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1091d80fc06aSjeremylt code << " readDofsStrided"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u"<<i<<", r_u"<<i<<");\n"; 1092920dcdc4Sjeremylt } 1093920dcdc4Sjeremylt } 1094920dcdc4Sjeremylt 1095920dcdc4Sjeremylt // Basis action 1096920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1097920dcdc4Sjeremylt switch (emode) { 1098920dcdc4Sjeremylt case CEED_EVAL_NONE: 109964d3f0c0Sjeremylt if (!useCollograd) { 1100920dcdc4Sjeremylt code << " CeedScalar* r_t"<<i<<" = r_u"<<i<<";\n"; 1101920dcdc4Sjeremylt } 1102920dcdc4Sjeremylt break; 1103920dcdc4Sjeremylt case CEED_EVAL_INTERP: 1104241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1105241a4b83SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1106241a4b83SYohann break; 1107241a4b83SYohann case CEED_EVAL_GRAD: 110864d3f0c0Sjeremylt if (useCollograd) { 1109ac421f39SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n"; 1110ac421f39SYohann code << " interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n"; 1111ac421f39SYohann } else { 1112241a4b83SYohann code << " CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n"; 1113241a4b83SYohann code << " grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n"; 1114ac421f39SYohann } 1115241a4b83SYohann break; 1116241a4b83SYohann case CEED_EVAL_WEIGHT: 1117241a4b83SYohann code << " CeedScalar r_t"<<i<<"[Q1d];\n"; 1118e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChkBackend(ierr); 1119e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1120437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 1121241a4b83SYohann code << " weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n"; 1122241a4b83SYohann break; // No action 1123241a4b83SYohann case CEED_EVAL_DIV: 1124241a4b83SYohann break; // TODO: Not implemented 1125241a4b83SYohann case CEED_EVAL_CURL: 1126241a4b83SYohann break; // TODO: Not implemented 1127241a4b83SYohann } 1128241a4b83SYohann } 1129ac421f39SYohann 1130241a4b83SYohann // Q function 1131818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 1132241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1133818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 1134241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1135e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1136241a4b83SYohann if (emode==CEED_EVAL_GRAD) 1137241a4b83SYohann { 113864d3f0c0Sjeremylt if (useCollograd) { 1139ac421f39SYohann //Accumulator for gradient slices 1140ac421f39SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1141ac421f39SYohann code << " for (CeedInt i = 0; i < ncomp_out_"<<i<<"; ++i) {\n"; 1142ac421f39SYohann code << " for (CeedInt j = 0; j < Q1d; ++j) {\n"; 1143ac421f39SYohann code << " r_tt"<<i<<"[j + i*Q1d] = 0.0;\n"; 1144ac421f39SYohann code << " }\n"; 1145ac421f39SYohann code << " }\n"; 1146ac421f39SYohann } else { 1147241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n"; 1148241a4b83SYohann } 1149ac421f39SYohann } 1150241a4b83SYohann if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP) 1151241a4b83SYohann { 1152241a4b83SYohann code << " CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n"; 1153241a4b83SYohann } 1154241a4b83SYohann } 1155ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 115664d3f0c0Sjeremylt if (useCollograd) { 1157920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1158ac421f39SYohann code << "#pragma unroll\n"; 1159ac421f39SYohann code << " for (CeedInt q=0; q<Q1d; q++) {\n"; 1160818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1161ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1162818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1163ac421f39SYohann // Get elemsize, emode, ncomp 1164ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode); 1165e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1166ac421f39SYohann // Basis action 1167920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1168ac421f39SYohann switch (emode) { 1169ac421f39SYohann case CEED_EVAL_NONE: 1170ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 11719a2291e3SJeremy L Thompson 11729a2291e3SJeremy L Thompson bool isStrided; 1173e15f9bd0SJeremy L Thompson ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict); CeedChkBackend(ierr); 1174e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 11759a2291e3SJeremy L Thompson if (!isStrided) { 11765c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1177e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 11785c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 11795c7b696cSjeremylt CeedInt compstride; 1180e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 11815c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1182e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 11839a2291e3SJeremy L Thompson data->indices.in[i] = restr_data->d_ind; 11845c7b696cSjeremylt code << " readSliceQuadsOffset"<<"3d<ncomp_in_"<<i<<", "<<compstride<<", Q1d>(data, lsize_in_"<<i<<", elem, q, indices.in["<<i<<"], d_u"<<i<<", r_q"<<i<<");\n"; 1185920dcdc4Sjeremylt } else { 1186*dc64899eSYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); CeedChkBackend(ierr); 1187920dcdc4Sjeremylt bool backendstrides; 1188fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1189e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 119013585805SJeremy L Thompson CeedInt nelem; 119113585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1192e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 119313585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1194920dcdc4Sjeremylt if (!backendstrides) { 1195920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1196e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1197920dcdc4Sjeremylt } 1198920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1199d80fc06aSjeremylt code << " readSliceQuadsStrided"<<"3d<ncomp_in_"<<i<<",Q1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u"<<i<<", r_q"<<i<<");\n"; 1200920dcdc4Sjeremylt } 1201ac421f39SYohann break; 1202ac421f39SYohann case CEED_EVAL_INTERP: 1203ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"];\n"; 1204ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_in_"<<i<<" ; ++j) {\n"; 1205ac421f39SYohann code << " r_q"<<i<<"[j] = r_t"<<i<<"[q + j*Q1d];\n"; 1206ac421f39SYohann code << " }\n"; 1207ac421f39SYohann break; 1208ac421f39SYohann case CEED_EVAL_GRAD: 1209ac421f39SYohann code << " CeedScalar r_q"<<i<<"[ncomp_in_"<<i<<"*Dim];\n"; 1210ac421f39SYohann code << " gradCollo3d<ncomp_in_"<<i<<",Q1d>(data, q, r_t"<<i<<", s_G_in_"<<i<<", r_q"<<i<<");\n"; 1211ac421f39SYohann break; 1212ac421f39SYohann case CEED_EVAL_WEIGHT: 1213ac421f39SYohann code << " CeedScalar r_q"<<i<<"[1];\n"; 1214ac421f39SYohann code << " r_q"<<i<<"[0] = r_t"<<i<<"[q];\n"; 1215ac421f39SYohann break; // No action 1216ac421f39SYohann case CEED_EVAL_DIV: 1217ac421f39SYohann break; // TODO: Not implemented 1218ac421f39SYohann case CEED_EVAL_CURL: 1219ac421f39SYohann break; // TODO: Not implemented 1220ac421f39SYohann } 1221ac421f39SYohann } 1222818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 1223ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1224818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1225ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1226e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1227ac421f39SYohann // Basis action 1228ac421f39SYohann switch (emode) { 1229ac421f39SYohann case CEED_EVAL_NONE: 1230ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1231ac421f39SYohann break; // No action 1232ac421f39SYohann case CEED_EVAL_INTERP: 1233ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"];\n"; 1234ac421f39SYohann break; 1235ac421f39SYohann case CEED_EVAL_GRAD: 1236ac421f39SYohann code << " CeedScalar r_qq"<<i<<"[ncomp_out_"<<i<<"*Dim];\n"; 1237ac421f39SYohann break; 1238ac421f39SYohann case CEED_EVAL_WEIGHT: 1239ac421f39SYohann break; // Should not occur 1240ac421f39SYohann case CEED_EVAL_DIV: 1241ac421f39SYohann break; // TODO: Not implemented 1242ac421f39SYohann case CEED_EVAL_CURL: 1243ac421f39SYohann break; // TODO: Not implemented 1244ac421f39SYohann } 1245ac421f39SYohann } 1246ac421f39SYohann } else { 1247920dcdc4Sjeremylt code << "\n // Note: No Collocated Gradient\n"; 1248818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 1249ac421f39SYohann for (CeedInt i = 0; i < numinputfields; i++) { 1250818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1251ac421f39SYohann code << " CeedScalar* r_q"<<i<<" = r_t"<<i<<";\n"; 1252ac421f39SYohann } 1253818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1254ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1255818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1256ac421f39SYohann code << " CeedScalar* r_qq"<<i<<" = r_tt"<<i<<";\n"; 1257ac421f39SYohann } 1258ac421f39SYohann } 1259818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 12604d537eeaSYohann code << " CeedScalar* in["<<numinputfields<<"];\n"; 12614d537eeaSYohann for (CeedInt i = 0; i < numinputfields; i++) { 1262818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 1263ac421f39SYohann code << " in["<<i<<"] = r_q"<<i<<";\n"; 12644d537eeaSYohann } 12654d537eeaSYohann code << " CeedScalar* out["<<numoutputfields<<"];\n"; 12664d537eeaSYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1267818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1268ac421f39SYohann code << " out["<<i<<"] = r_qq"<<i<<";\n"; 12694d537eeaSYohann } 1270818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 1271ac421f39SYohann code << " "<<qFunctionName<<"(ctx, "; 127264d3f0c0Sjeremylt if (dim != 3 || useCollograd) { 1273ac421f39SYohann code << "1"; 1274ac421f39SYohann } else { 1275ac421f39SYohann code << "Q1d"; 1276ac421f39SYohann } 1277ac421f39SYohann code << ", in, out);\n"; 127864d3f0c0Sjeremylt if (useCollograd) { 1279920dcdc4Sjeremylt code << "\n // Note: Collocated Gradient\n"; 1280818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 1281ac421f39SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1282818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1283ac421f39SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1284e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1285ac421f39SYohann // Basis action 1286920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1287ac421f39SYohann switch (emode) { 1288ac421f39SYohann case CEED_EVAL_NONE: 1289ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1290ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1291ac421f39SYohann code << " }\n"; 1292ac421f39SYohann break; // No action 1293ac421f39SYohann case CEED_EVAL_INTERP: 1294ac421f39SYohann code << " for (CeedInt j = 0; j < ncomp_out_"<<i<<" ; ++j) {\n"; 1295ac421f39SYohann code << " r_tt"<<i<<"[q + j*Q1d] = r_qq"<<i<<"[j];\n"; 1296ac421f39SYohann code << " }\n"; 1297ac421f39SYohann break; 1298ac421f39SYohann case CEED_EVAL_GRAD: 1299ac421f39SYohann code << " gradColloTranspose3d<ncomp_out_"<<i<<",Q1d>(data, q, r_qq"<<i<<", s_G_out_"<<i<<", r_tt"<<i<<");\n"; 1300ac421f39SYohann break; 1301ac421f39SYohann case CEED_EVAL_WEIGHT: 1302ac421f39SYohann break; // Should not occur 1303ac421f39SYohann case CEED_EVAL_DIV: 1304ac421f39SYohann break; // TODO: Not implemented 1305ac421f39SYohann case CEED_EVAL_CURL: 1306ac421f39SYohann break; // TODO: Not implemented 1307ac421f39SYohann } 1308ac421f39SYohann } 1309ac421f39SYohann code << " }\n"; 1310ac421f39SYohann } 1311241a4b83SYohann 1312241a4b83SYohann // Output basis apply if needed 1313ac421f39SYohann // Generate the correct eval mode code for each output 1314818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 1315241a4b83SYohann for (CeedInt i = 0; i < numoutputfields; i++) { 1316818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 1317241a4b83SYohann // Get elemsize, emode, ncomp 1318241a4b83SYohann ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict); 1319e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1320241a4b83SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize); 1321e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1322241a4b83SYohann ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode); 1323e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13244d537eeaSYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &ncomp); 1325e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1326241a4b83SYohann // Basis action 1327920dcdc4Sjeremylt code << " // EvalMode: "<<CeedEvalModes[emode]<<"\n"; 1328241a4b83SYohann switch (emode) { 1329241a4b83SYohann case CEED_EVAL_NONE: 1330920dcdc4Sjeremylt code << " CeedScalar* r_v"<<i<<" = r_tt"<<i<<";\n"; 1331241a4b83SYohann break; // No action 1332241a4b83SYohann case CEED_EVAL_INTERP: 1333241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 1334241a4b83SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1335241a4b83SYohann break; 1336241a4b83SYohann case CEED_EVAL_GRAD: 1337241a4b83SYohann code << " CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n"; 133864d3f0c0Sjeremylt if (useCollograd) { 1339ac421f39SYohann code << " interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n"; 1340ac421f39SYohann } else { 1341241a4b83SYohann code << " gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n"; 1342ac421f39SYohann } 1343241a4b83SYohann break; 1344e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 1345241a4b83SYohann case CEED_EVAL_WEIGHT: { 1346241a4b83SYohann Ceed ceed; 1347e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 1348e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1349241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 1350241a4b83SYohann break; // Should not occur 1351241a4b83SYohann } 1352241a4b83SYohann case CEED_EVAL_DIV: 1353241a4b83SYohann break; // TODO: Not implemented 1354241a4b83SYohann case CEED_EVAL_CURL: 1355241a4b83SYohann break; // TODO: Not implemented 1356e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1357241a4b83SYohann } 1358920dcdc4Sjeremylt // Restriction 13599a2291e3SJeremy L Thompson bool isStrided; 1360e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(Erestrict, &isStrided); CeedChkBackend(ierr); 13619a2291e3SJeremy L Thompson if (!isStrided) { 13625c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 1363e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 13645c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 13655c7b696cSjeremylt CeedInt compstride; 1366e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(Erestrict, &compstride); CeedChkBackend(ierr); 13675c7b696cSjeremylt code << " // CompStride: "<<compstride<<"\n"; 1368e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 13699a2291e3SJeremy L Thompson data->indices.out[i] = restr_data->d_ind; 13705c7b696cSjeremylt code << " writeDofsOffset"<<dim<<"d<ncomp_out_"<<i<<", "<<compstride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n"; 1371920dcdc4Sjeremylt } else { 1372920dcdc4Sjeremylt bool backendstrides; 1373fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 1374e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137513585805SJeremy L Thompson CeedInt nelem; 137613585805SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(Erestrict, &nelem); 1377e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 137813585805SJeremy L Thompson CeedInt strides[3] = {1, elemsize*nelem, elemsize}; 1379920dcdc4Sjeremylt if (!backendstrides) { 1380920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 1381e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1382920dcdc4Sjeremylt } 1383920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 1384d80fc06aSjeremylt code << " writeDofsStrided"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v"<<i<<", d_v"<<i<<");\n"; 1385920dcdc4Sjeremylt } 1386241a4b83SYohann } 1387241a4b83SYohann 1388241a4b83SYohann code << " }\n"; 1389818e0025SJeremy L Thompson code << "}\n"; 1390818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1391241a4b83SYohann 1392ab213215SJeremy L Thompson // View kernel for debugging 139346dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 13943f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 1395241a4b83SYohann 139618d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 139718d499f1SYohann "T1d", CeedIntMax(Q1d, data->maxP1d)); 1398e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1399c3c443faSYohann Dudouit ierr = CeedGetKernelCuda(ceed, data->module, oper.c_str(), &data->op); 1400e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1401241a4b83SYohann 1402e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 1403e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1404241a4b83SYohann } 1405ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1406