1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9e1d4b82SJeremy L Thompson // 4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*9e1d4b82SJeremy L Thompson // 6*9e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*9e1d4b82SJeremy L Thompson 8*9e1d4b82SJeremy L Thompson /// @file 9*9e1d4b82SJeremy L Thompson /// Internal header for CUDA shared memory tensor product basis AtPoints templates 10*9e1d4b82SJeremy L Thompson #include <ceed/types.h> 11*9e1d4b82SJeremy L Thompson 12*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 13*9e1d4b82SJeremy L Thompson // Chebyshev values 14*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 15*9e1d4b82SJeremy L Thompson template <int Q_1D> 16*9e1d4b82SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 17*9e1d4b82SJeremy L Thompson chebyshev_x[0] = 1.0; 18*9e1d4b82SJeremy L Thompson chebyshev_x[1] = 2 * x; 19*9e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 20*9e1d4b82SJeremy L Thompson } 21*9e1d4b82SJeremy L Thompson 22*9e1d4b82SJeremy L Thompson template <int Q_1D> 23*9e1d4b82SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 24*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[3]; 25*9e1d4b82SJeremy L Thompson 26*9e1d4b82SJeremy L Thompson chebyshev_x[1] = 1.0; 27*9e1d4b82SJeremy L Thompson chebyshev_x[2] = 2 * x; 28*9e1d4b82SJeremy L Thompson chebyshev_dx[0] = 0.0; 29*9e1d4b82SJeremy L Thompson chebyshev_dx[1] = 2.0; 30*9e1d4b82SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 31*9e1d4b82SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 32*9e1d4b82SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 33*9e1d4b82SJeremy L Thompson } 34*9e1d4b82SJeremy L Thompson } 35*9e1d4b82SJeremy L Thompson 36*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 37*9e1d4b82SJeremy L Thompson // 1D 38*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 39*9e1d4b82SJeremy L Thompson 40*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 41*9e1d4b82SJeremy L Thompson // 1D interpolate to points 42*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 43*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 44*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 45*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 46*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 47*9e1d4b82SJeremy L Thompson 48*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 49*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 50*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 51*9e1d4b82SJeremy L Thompson // Load coefficients 52*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 53*9e1d4b82SJeremy L Thompson __syncthreads(); 54*9e1d4b82SJeremy L Thompson // Contract x direction 55*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 56*9e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 57*9e1d4b82SJeremy L Thompson } 58*9e1d4b82SJeremy L Thompson } 59*9e1d4b82SJeremy L Thompson } 60*9e1d4b82SJeremy L Thompson 61*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 62*9e1d4b82SJeremy L Thompson // 1D interpolate transpose 63*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 64*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 65*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 66*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 67*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 68*9e1d4b82SJeremy L Thompson 69*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 70*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 71*9e1d4b82SJeremy L Thompson // Clear shared memory 72*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 73*9e1d4b82SJeremy L Thompson __syncthreads(); 74*9e1d4b82SJeremy L Thompson // Contract x direction 75*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 76*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 77*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); 78*9e1d4b82SJeremy L Thompson } 79*9e1d4b82SJeremy L Thompson } 80*9e1d4b82SJeremy L Thompson // Pull from shared to register 81*9e1d4b82SJeremy L Thompson __syncthreads(); 82*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; 83*9e1d4b82SJeremy L Thompson } 84*9e1d4b82SJeremy L Thompson } 85*9e1d4b82SJeremy L Thompson 86*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 87*9e1d4b82SJeremy L Thompson // 1D derivatives at points 88*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 89*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 90*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 91*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 92*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 93*9e1d4b82SJeremy L Thompson 94*9e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 95*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 96*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 97*9e1d4b82SJeremy L Thompson // Load coefficients 98*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 99*9e1d4b82SJeremy L Thompson __syncthreads(); 100*9e1d4b82SJeremy L Thompson // Contract x direction 101*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 102*9e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * data.slice[i]; 103*9e1d4b82SJeremy L Thompson } 104*9e1d4b82SJeremy L Thompson } 105*9e1d4b82SJeremy L Thompson } 106*9e1d4b82SJeremy L Thompson 107*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 108*9e1d4b82SJeremy L Thompson // 1D derivatives transpose 109*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 110*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 111*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 112*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 113*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 114*9e1d4b82SJeremy L Thompson 115*9e1d4b82SJeremy L Thompson ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 116*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 117*9e1d4b82SJeremy L Thompson // Clear shared memory 118*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 119*9e1d4b82SJeremy L Thompson __syncthreads(); 120*9e1d4b82SJeremy L Thompson // Contract x direction 121*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 122*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 123*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); 124*9e1d4b82SJeremy L Thompson } 125*9e1d4b82SJeremy L Thompson } 126*9e1d4b82SJeremy L Thompson // Pull from shared to register 127*9e1d4b82SJeremy L Thompson __syncthreads(); 128*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; 129*9e1d4b82SJeremy L Thompson } 130*9e1d4b82SJeremy L Thompson } 131*9e1d4b82SJeremy L Thompson 132*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 133*9e1d4b82SJeremy L Thompson // 2D 134*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 135*9e1d4b82SJeremy L Thompson 136*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 137*9e1d4b82SJeremy L Thompson // 2D interpolate to points 138*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 139*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 140*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 141*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 142*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 143*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 144*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 145*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 146*9e1d4b82SJeremy L Thompson 147*9e1d4b82SJeremy L Thompson // Load coefficients 148*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 149*9e1d4b82SJeremy L Thompson __syncthreads(); 150*9e1d4b82SJeremy L Thompson // Contract x direction 151*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 152*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 153*9e1d4b82SJeremy L Thompson buffer[i] = 0.0; 154*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 155*9e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 156*9e1d4b82SJeremy L Thompson } 157*9e1d4b82SJeremy L Thompson } 158*9e1d4b82SJeremy L Thompson // Contract y direction 159*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 160*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 161*9e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i]; 162*9e1d4b82SJeremy L Thompson } 163*9e1d4b82SJeremy L Thompson } 164*9e1d4b82SJeremy L Thompson } 165*9e1d4b82SJeremy L Thompson 166*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 167*9e1d4b82SJeremy L Thompson // 2D interpolate transpose 168*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 169*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 170*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 171*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 172*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 173*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 174*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 175*9e1d4b82SJeremy L Thompson 176*9e1d4b82SJeremy L Thompson // Clear shared memory 177*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 178*9e1d4b82SJeremy L Thompson __syncthreads(); 179*9e1d4b82SJeremy L Thompson // Contract y direction 180*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 181*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 182*9e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp]; 183*9e1d4b82SJeremy L Thompson } 184*9e1d4b82SJeremy L Thompson // Contract x direction 185*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 186*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 187*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 188*9e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 189*9e1d4b82SJeremy L Thompson const CeedInt ii = (i + (p / Q_1D)) % Q_1D; 190*9e1d4b82SJeremy L Thompson 191*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 192*9e1d4b82SJeremy L Thompson const CeedInt jj = (j + p) % Q_1D; 193*9e1d4b82SJeremy L Thompson 194*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 195*9e1d4b82SJeremy L Thompson } 196*9e1d4b82SJeremy L Thompson } 197*9e1d4b82SJeremy L Thompson } 198*9e1d4b82SJeremy L Thompson // Pull from shared to register 199*9e1d4b82SJeremy L Thompson __syncthreads(); 200*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 201*9e1d4b82SJeremy L Thompson } 202*9e1d4b82SJeremy L Thompson } 203*9e1d4b82SJeremy L Thompson 204*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 205*9e1d4b82SJeremy L Thompson // 2D derivatives at points 206*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 207*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 208*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 209*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 210*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 211*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 212*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 213*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 214*9e1d4b82SJeremy L Thompson 215*9e1d4b82SJeremy L Thompson // Load coefficients 216*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 217*9e1d4b82SJeremy L Thompson __syncthreads(); 218*9e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 219*9e1d4b82SJeremy L Thompson // Contract x direction 220*9e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 221*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 222*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 223*9e1d4b82SJeremy L Thompson buffer[i] = 0.0; 224*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 225*9e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 226*9e1d4b82SJeremy L Thompson } 227*9e1d4b82SJeremy L Thompson } 228*9e1d4b82SJeremy L Thompson // Contract y direction 229*9e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 230*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 231*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 232*9e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 233*9e1d4b82SJeremy L Thompson } 234*9e1d4b82SJeremy L Thompson } 235*9e1d4b82SJeremy L Thompson } 236*9e1d4b82SJeremy L Thompson } 237*9e1d4b82SJeremy L Thompson 238*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 239*9e1d4b82SJeremy L Thompson // 2D derivatives transpose 240*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 241*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 242*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 243*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 244*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 245*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 246*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 247*9e1d4b82SJeremy L Thompson 248*9e1d4b82SJeremy L Thompson // Clear shared memory 249*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 250*9e1d4b82SJeremy L Thompson __syncthreads(); 251*9e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 2; dim++) { 252*9e1d4b82SJeremy L Thompson // Contract y direction 253*9e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 254*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 255*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 256*9e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 257*9e1d4b82SJeremy L Thompson } 258*9e1d4b82SJeremy L Thompson // Contract x direction 259*9e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 260*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 261*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 262*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 263*9e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 264*9e1d4b82SJeremy L Thompson const CeedInt ii = (i + (p / Q_1D)) % Q_1D; 265*9e1d4b82SJeremy L Thompson 266*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 267*9e1d4b82SJeremy L Thompson const CeedInt jj = (j + p) % Q_1D; 268*9e1d4b82SJeremy L Thompson 269*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 270*9e1d4b82SJeremy L Thompson } 271*9e1d4b82SJeremy L Thompson } 272*9e1d4b82SJeremy L Thompson } 273*9e1d4b82SJeremy L Thompson } 274*9e1d4b82SJeremy L Thompson // Pull from shared to register 275*9e1d4b82SJeremy L Thompson __syncthreads(); 276*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 277*9e1d4b82SJeremy L Thompson } 278*9e1d4b82SJeremy L Thompson } 279*9e1d4b82SJeremy L Thompson 280*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 281*9e1d4b82SJeremy L Thompson // 3D 282*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 283*9e1d4b82SJeremy L Thompson 284*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 285*9e1d4b82SJeremy L Thompson // 3D interpolate to points 286*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 287*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 288*9e1d4b82SJeremy L Thompson inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 289*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 290*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 291*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 292*9e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 293*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 294*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 295*9e1d4b82SJeremy L Thompson 296*9e1d4b82SJeremy L Thompson // Load coefficients 297*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 298*9e1d4b82SJeremy L Thompson __syncthreads(); 299*9e1d4b82SJeremy L Thompson // Contract x direction 300*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 301*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 302*9e1d4b82SJeremy L Thompson buffer[i] = 0.0; 303*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 304*9e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 305*9e1d4b82SJeremy L Thompson } 306*9e1d4b82SJeremy L Thompson } 307*9e1d4b82SJeremy L Thompson // Contract y and z direction 308*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 309*9e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 310*9e1d4b82SJeremy L Thompson 311*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 312*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 313*9e1d4b82SJeremy L Thompson r_V[comp] += chebyshev_x[i] * buffer[i] * z; 314*9e1d4b82SJeremy L Thompson } 315*9e1d4b82SJeremy L Thompson } 316*9e1d4b82SJeremy L Thompson } 317*9e1d4b82SJeremy L Thompson } 318*9e1d4b82SJeremy L Thompson 319*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 320*9e1d4b82SJeremy L Thompson // 3D interpolate transpose 321*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 322*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 323*9e1d4b82SJeremy L Thompson inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 324*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 325*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 326*9e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 327*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 328*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 329*9e1d4b82SJeremy L Thompson 330*9e1d4b82SJeremy L Thompson // Clear shared memory 331*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 332*9e1d4b82SJeremy L Thompson __syncthreads(); 333*9e1d4b82SJeremy L Thompson // Contract y and z direction 334*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 335*9e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 336*9e1d4b82SJeremy L Thompson 337*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 338*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 339*9e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp] * z; 340*9e1d4b82SJeremy L Thompson } 341*9e1d4b82SJeremy L Thompson // Contract x direction 342*9e1d4b82SJeremy L Thompson ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 343*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 344*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 345*9e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 346*9e1d4b82SJeremy L Thompson const CeedInt ii = (i + (p / Q_1D)) % Q_1D; 347*9e1d4b82SJeremy L Thompson 348*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 349*9e1d4b82SJeremy L Thompson const CeedInt jj = ((j + p) % Q_1D); 350*9e1d4b82SJeremy L Thompson 351*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 352*9e1d4b82SJeremy L Thompson } 353*9e1d4b82SJeremy L Thompson } 354*9e1d4b82SJeremy L Thompson } 355*9e1d4b82SJeremy L Thompson // Pull from shared to register 356*9e1d4b82SJeremy L Thompson __syncthreads(); 357*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 358*9e1d4b82SJeremy L Thompson } 359*9e1d4b82SJeremy L Thompson } 360*9e1d4b82SJeremy L Thompson } 361*9e1d4b82SJeremy L Thompson 362*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 363*9e1d4b82SJeremy L Thompson // 3D derivatives at points 364*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 365*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 366*9e1d4b82SJeremy L Thompson inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 367*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_V) { 368*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 369*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 370*9e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 371*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 372*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 373*9e1d4b82SJeremy L Thompson 374*9e1d4b82SJeremy L Thompson // Load coefficients 375*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 376*9e1d4b82SJeremy L Thompson __syncthreads(); 377*9e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 378*9e1d4b82SJeremy L Thompson // Contract x direction 379*9e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 380*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 381*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 382*9e1d4b82SJeremy L Thompson buffer[i] = 0.0; 383*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 384*9e1d4b82SJeremy L Thompson buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 385*9e1d4b82SJeremy L Thompson } 386*9e1d4b82SJeremy L Thompson } 387*9e1d4b82SJeremy L Thompson // Contract y and z direction 388*9e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 389*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 390*9e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 391*9e1d4b82SJeremy L Thompson 392*9e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 393*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 394*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 395*9e1d4b82SJeremy L Thompson r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; 396*9e1d4b82SJeremy L Thompson } 397*9e1d4b82SJeremy L Thompson } 398*9e1d4b82SJeremy L Thompson } 399*9e1d4b82SJeremy L Thompson } 400*9e1d4b82SJeremy L Thompson } 401*9e1d4b82SJeremy L Thompson 402*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 403*9e1d4b82SJeremy L Thompson // 3D derivatives transpose 404*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 405*9e1d4b82SJeremy L Thompson template <int NUM_COMP, int NUM_POINTS, int Q_1D> 406*9e1d4b82SJeremy L Thompson inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 407*9e1d4b82SJeremy L Thompson CeedScalar *__restrict__ r_C) { 408*9e1d4b82SJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 409*9e1d4b82SJeremy L Thompson for (CeedInt k = 0; k < Q_1D; k++) { 410*9e1d4b82SJeremy L Thompson CeedScalar buffer[Q_1D]; 411*9e1d4b82SJeremy L Thompson CeedScalar chebyshev_x[Q_1D]; 412*9e1d4b82SJeremy L Thompson 413*9e1d4b82SJeremy L Thompson // Clear shared memory 414*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 415*9e1d4b82SJeremy L Thompson __syncthreads(); 416*9e1d4b82SJeremy L Thompson for (CeedInt dim = 0; dim < 3; dim++) { 417*9e1d4b82SJeremy L Thompson // Contract y and z direction 418*9e1d4b82SJeremy L Thompson if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 419*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 420*9e1d4b82SJeremy L Thompson const CeedScalar z = chebyshev_x[k]; 421*9e1d4b82SJeremy L Thompson 422*9e1d4b82SJeremy L Thompson if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 423*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 424*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 425*9e1d4b82SJeremy L Thompson buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; 426*9e1d4b82SJeremy L Thompson } 427*9e1d4b82SJeremy L Thompson // Contract x direction 428*9e1d4b82SJeremy L Thompson if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 429*9e1d4b82SJeremy L Thompson else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 430*9e1d4b82SJeremy L Thompson if (p < NUM_POINTS) { 431*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) { 432*9e1d4b82SJeremy L Thompson // Note: shifting to avoid atomic adds 433*9e1d4b82SJeremy L Thompson const CeedInt ii = (i + (p / Q_1D)) % Q_1D; 434*9e1d4b82SJeremy L Thompson 435*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < Q_1D; j++) { 436*9e1d4b82SJeremy L Thompson const CeedInt jj = ((j + p) % Q_1D); 437*9e1d4b82SJeremy L Thompson 438*9e1d4b82SJeremy L Thompson atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 439*9e1d4b82SJeremy L Thompson } 440*9e1d4b82SJeremy L Thompson } 441*9e1d4b82SJeremy L Thompson } 442*9e1d4b82SJeremy L Thompson } 443*9e1d4b82SJeremy L Thompson // Pull from shared to register 444*9e1d4b82SJeremy L Thompson __syncthreads(); 445*9e1d4b82SJeremy L Thompson if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 446*9e1d4b82SJeremy L Thompson } 447*9e1d4b82SJeremy L Thompson } 448*9e1d4b82SJeremy L Thompson } 449