1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9e1d4b82SJeremy L Thompson // 4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*9e1d4b82SJeremy L Thompson // 6*9e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*9e1d4b82SJeremy L Thompson 8*9e1d4b82SJeremy L Thompson /// @file 9*9e1d4b82SJeremy L Thompson /// Internal header for HIP tensor product basis with AtPoints evaluation 10*9e1d4b82SJeremy L Thompson #include <ceed/types.h> 11*9e1d4b82SJeremy L Thompson 12*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-read-write-templates.h" 13*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-at-points-templates.h" 14*9e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-templates.h" 15*9e1d4b82SJeremy L Thompson 16*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 17*9e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 18*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 19*9e1d4b82SJeremy L Thompson 20*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 21*9e1d4b82SJeremy L Thompson // Interp 22*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 23*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 24*9e1d4b82SJeremy L Thompson void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem, 25*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 26*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 27*9e1d4b82SJeremy L Thompson 28*9e1d4b82SJeremy L Thompson // load chebyshev_interp_1d into shared memory 29*9e1d4b82SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 30*9e1d4b82SJeremy L Thompson loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B); 31*9e1d4b82SJeremy L Thompson __syncthreads(); 32*9e1d4b82SJeremy L Thompson 33*9e1d4b82SJeremy L Thompson SharedData_Hip data; 34*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 35*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 36*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 37*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 38*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 39*9e1d4b82SJeremy L Thompson 40*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 41*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 42*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 43*9e1d4b82SJeremy L Thompson 44*9e1d4b82SJeremy L Thompson // Apply basis element by element 45*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 46*9e1d4b82SJeremy L Thompson // Map to coefficients 47*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 48*9e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 49*9e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 50*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 51*9e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 52*9e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 53*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 54*9e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 55*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 56*9e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 57*9e1d4b82SJeremy L Thompson } 58*9e1d4b82SJeremy L Thompson 59*9e1d4b82SJeremy L Thompson // Map to points 60*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 61*9e1d4b82SJeremy L Thompson 62*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 63*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 64*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 65*9e1d4b82SJeremy L Thompson 66*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 67*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 68*9e1d4b82SJeremy L Thompson } 69*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 70*9e1d4b82SJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 71*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 72*9e1d4b82SJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 73*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 74*9e1d4b82SJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 75*9e1d4b82SJeremy L Thompson } 76*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) { 77*9e1d4b82SJeremy L Thompson if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j]; 78*9e1d4b82SJeremy L Thompson } 79*9e1d4b82SJeremy L Thompson } 80*9e1d4b82SJeremy L Thompson } 81*9e1d4b82SJeremy L Thompson } 82*9e1d4b82SJeremy L Thompson 83*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 84*9e1d4b82SJeremy L Thompson void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem, 85*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 86*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 87*9e1d4b82SJeremy L Thompson 88*9e1d4b82SJeremy L Thompson // load chebyshev_interp_1d into shared memory 89*9e1d4b82SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 90*9e1d4b82SJeremy L Thompson loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B); 91*9e1d4b82SJeremy L Thompson __syncthreads(); 92*9e1d4b82SJeremy L Thompson 93*9e1d4b82SJeremy L Thompson SharedData_Hip data; 94*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 95*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 96*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 97*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 98*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 99*9e1d4b82SJeremy L Thompson 100*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 101*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 102*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 103*9e1d4b82SJeremy L Thompson 104*9e1d4b82SJeremy L Thompson // Apply basis element by element 105*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 106*9e1d4b82SJeremy L Thompson // Clear register 107*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 108*9e1d4b82SJeremy L Thompson 109*9e1d4b82SJeremy L Thompson // Map from points 110*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 111*9e1d4b82SJeremy L Thompson 112*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 113*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 114*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 115*9e1d4b82SJeremy L Thompson 116*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 117*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 118*9e1d4b82SJeremy L Thompson } 119*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) { 120*9e1d4b82SJeremy L Thompson if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p]; 121*9e1d4b82SJeremy L Thompson else r_U[j] = 0.0; 122*9e1d4b82SJeremy L Thompson } 123*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 124*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 125*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 126*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 127*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 128*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 129*9e1d4b82SJeremy L Thompson } 130*9e1d4b82SJeremy L Thompson } 131*9e1d4b82SJeremy L Thompson __syncthreads(); 132*9e1d4b82SJeremy L Thompson 133*9e1d4b82SJeremy L Thompson // Map from coefficients 134*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 135*9e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 136*9e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 137*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 138*9e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 139*9e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 140*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 141*9e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 142*9e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 143*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 144*9e1d4b82SJeremy L Thompson } 145*9e1d4b82SJeremy L Thompson } 146*9e1d4b82SJeremy L Thompson } 147*9e1d4b82SJeremy L Thompson 148*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 149*9e1d4b82SJeremy L Thompson // Grad 150*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 151*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 152*9e1d4b82SJeremy L Thompson void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem, 153*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 154*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 155*9e1d4b82SJeremy L Thompson 156*9e1d4b82SJeremy L Thompson // load chebyshev_interp_1d into shared memory 157*9e1d4b82SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 158*9e1d4b82SJeremy L Thompson loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B); 159*9e1d4b82SJeremy L Thompson __syncthreads(); 160*9e1d4b82SJeremy L Thompson 161*9e1d4b82SJeremy L Thompson SharedData_Hip data; 162*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 163*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 164*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 165*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 166*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 167*9e1d4b82SJeremy L Thompson 168*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 169*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 170*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 171*9e1d4b82SJeremy L Thompson 172*9e1d4b82SJeremy L Thompson // Apply basis element by element 173*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 174*9e1d4b82SJeremy L Thompson // Map to coefficients 175*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 176*9e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 177*9e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 178*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 179*9e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 180*9e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 181*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 182*9e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 183*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 184*9e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 185*9e1d4b82SJeremy L Thompson } 186*9e1d4b82SJeremy L Thompson 187*9e1d4b82SJeremy L Thompson // Map to points 188*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 189*9e1d4b82SJeremy L Thompson 190*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 191*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 192*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 193*9e1d4b82SJeremy L Thompson 194*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 195*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 196*9e1d4b82SJeremy L Thompson } 197*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 198*9e1d4b82SJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 199*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 200*9e1d4b82SJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 201*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 202*9e1d4b82SJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 203*9e1d4b82SJeremy L Thompson } 204*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) { 205*9e1d4b82SJeremy L Thompson if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j]; 206*9e1d4b82SJeremy L Thompson } 207*9e1d4b82SJeremy L Thompson } 208*9e1d4b82SJeremy L Thompson } 209*9e1d4b82SJeremy L Thompson } 210*9e1d4b82SJeremy L Thompson 211*9e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 212*9e1d4b82SJeremy L Thompson void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem, 213*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 214*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 215*9e1d4b82SJeremy L Thompson 216*9e1d4b82SJeremy L Thompson // load chebyshev_interp_1d into shared memory 217*9e1d4b82SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 218*9e1d4b82SJeremy L Thompson loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B); 219*9e1d4b82SJeremy L Thompson __syncthreads(); 220*9e1d4b82SJeremy L Thompson 221*9e1d4b82SJeremy L Thompson SharedData_Hip data; 222*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 223*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 224*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 225*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 226*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 227*9e1d4b82SJeremy L Thompson 228*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 229*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 230*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 231*9e1d4b82SJeremy L Thompson 232*9e1d4b82SJeremy L Thompson // Apply basis element by element 233*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 234*9e1d4b82SJeremy L Thompson // Clear register 235*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 236*9e1d4b82SJeremy L Thompson 237*9e1d4b82SJeremy L Thompson // Map from points 238*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 239*9e1d4b82SJeremy L Thompson 240*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 241*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 242*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 243*9e1d4b82SJeremy L Thompson 244*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 245*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 246*9e1d4b82SJeremy L Thompson } 247*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) { 248*9e1d4b82SJeremy L Thompson if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p]; 249*9e1d4b82SJeremy L Thompson else r_U[j] = 0.0; 250*9e1d4b82SJeremy L Thompson } 251*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 252*9e1d4b82SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 253*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 254*9e1d4b82SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 255*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 256*9e1d4b82SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 257*9e1d4b82SJeremy L Thompson } 258*9e1d4b82SJeremy L Thompson } 259*9e1d4b82SJeremy L Thompson __syncthreads(); 260*9e1d4b82SJeremy L Thompson 261*9e1d4b82SJeremy L Thompson // Map from coefficients 262*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 263*9e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 264*9e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 265*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 266*9e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 267*9e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 268*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 269*9e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 270*9e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 271*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 272*9e1d4b82SJeremy L Thompson } 273*9e1d4b82SJeremy L Thompson } 274*9e1d4b82SJeremy L Thompson } 275