1*9e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*9e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*9e1d4b82SJeremy L Thompson // 4*9e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*9e1d4b82SJeremy L Thompson // 6*9e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*9e1d4b82SJeremy L Thompson 8*9e1d4b82SJeremy L Thompson /// @file 9*9e1d4b82SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 10*9e1d4b82SJeremy L Thompson #include <ceed/types.h> 11*9e1d4b82SJeremy L Thompson 12*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-read-write-templates.h" 13*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-at-points-templates.h" 14*9e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-templates.h" 15*9e1d4b82SJeremy L Thompson 16*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 17*9e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 18*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 19*9e1d4b82SJeremy L Thompson 20*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 21*9e1d4b82SJeremy L Thompson // Interp 22*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 23*9e1d4b82SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 24*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 25*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 26*9e1d4b82SJeremy L Thompson 27*9e1d4b82SJeremy L Thompson SharedData_Cuda data; 28*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 29*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 30*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 31*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 32*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 33*9e1d4b82SJeremy L Thompson 34*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 35*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 36*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 37*9e1d4b82SJeremy L Thompson 38*9e1d4b82SJeremy L Thompson // Apply basis element by element 39*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 40*9e1d4b82SJeremy L Thompson // Map to coefficients 41*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 42*9e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 43*9e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 44*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 45*9e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 46*9e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 47*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 48*9e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 49*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 50*9e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 51*9e1d4b82SJeremy L Thompson } 52*9e1d4b82SJeremy L Thompson 53*9e1d4b82SJeremy L Thompson // Map to points 54*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 55*9e1d4b82SJeremy L Thompson 56*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 57*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 58*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 59*9e1d4b82SJeremy L Thompson 60*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 61*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 62*9e1d4b82SJeremy L Thompson } 63*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 64*9e1d4b82SJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 65*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 66*9e1d4b82SJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 67*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 68*9e1d4b82SJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 69*9e1d4b82SJeremy L Thompson } 70*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) { 71*9e1d4b82SJeremy L Thompson if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j]; 72*9e1d4b82SJeremy L Thompson } 73*9e1d4b82SJeremy L Thompson } 74*9e1d4b82SJeremy L Thompson } 75*9e1d4b82SJeremy L Thompson } 76*9e1d4b82SJeremy L Thompson 77*9e1d4b82SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 78*9e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 79*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 80*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 81*9e1d4b82SJeremy L Thompson 82*9e1d4b82SJeremy L Thompson SharedData_Cuda data; 83*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 84*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 85*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 86*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 87*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 88*9e1d4b82SJeremy L Thompson 89*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 90*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 91*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 92*9e1d4b82SJeremy L Thompson 93*9e1d4b82SJeremy L Thompson // Apply basis element by element 94*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 95*9e1d4b82SJeremy L Thompson // Clear register 96*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 97*9e1d4b82SJeremy L Thompson 98*9e1d4b82SJeremy L Thompson // Map from points 99*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 100*9e1d4b82SJeremy L Thompson 101*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 102*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 103*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 104*9e1d4b82SJeremy L Thompson 105*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 106*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 107*9e1d4b82SJeremy L Thompson } 108*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) { 109*9e1d4b82SJeremy L Thompson if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p]; 110*9e1d4b82SJeremy L Thompson else r_U[j] = 0.0; 111*9e1d4b82SJeremy L Thompson } 112*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 113*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 114*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 115*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 116*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 117*9e1d4b82SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 118*9e1d4b82SJeremy L Thompson } 119*9e1d4b82SJeremy L Thompson } 120*9e1d4b82SJeremy L Thompson __syncthreads(); 121*9e1d4b82SJeremy L Thompson 122*9e1d4b82SJeremy L Thompson // Map from coefficients 123*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 124*9e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 125*9e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 126*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 127*9e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 128*9e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 129*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 130*9e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 131*9e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 132*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 133*9e1d4b82SJeremy L Thompson } 134*9e1d4b82SJeremy L Thompson } 135*9e1d4b82SJeremy L Thompson } 136*9e1d4b82SJeremy L Thompson 137*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 138*9e1d4b82SJeremy L Thompson // Grad 139*9e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 140*9e1d4b82SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 141*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 142*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 143*9e1d4b82SJeremy L Thompson 144*9e1d4b82SJeremy L Thompson SharedData_Cuda data; 145*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 146*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 147*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 148*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 149*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 150*9e1d4b82SJeremy L Thompson 151*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 152*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 153*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 154*9e1d4b82SJeremy L Thompson 155*9e1d4b82SJeremy L Thompson // Apply basis element by element 156*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 157*9e1d4b82SJeremy L Thompson // Map to coefficients 158*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 159*9e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 160*9e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 161*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 162*9e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 163*9e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 164*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 165*9e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 166*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 167*9e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C); 168*9e1d4b82SJeremy L Thompson } 169*9e1d4b82SJeremy L Thompson 170*9e1d4b82SJeremy L Thompson // Map to points 171*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 172*9e1d4b82SJeremy L Thompson 173*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 174*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 175*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 176*9e1d4b82SJeremy L Thompson 177*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 178*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 179*9e1d4b82SJeremy L Thompson } 180*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 181*9e1d4b82SJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 182*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 183*9e1d4b82SJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 184*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 185*9e1d4b82SJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 186*9e1d4b82SJeremy L Thompson } 187*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) { 188*9e1d4b82SJeremy L Thompson if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j]; 189*9e1d4b82SJeremy L Thompson } 190*9e1d4b82SJeremy L Thompson } 191*9e1d4b82SJeremy L Thompson } 192*9e1d4b82SJeremy L Thompson } 193*9e1d4b82SJeremy L Thompson 194*9e1d4b82SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 195*9e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 196*9e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 197*9e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 198*9e1d4b82SJeremy L Thompson 199*9e1d4b82SJeremy L Thompson SharedData_Cuda data; 200*9e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 201*9e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 202*9e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 203*9e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 204*9e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 205*9e1d4b82SJeremy L Thompson 206*9e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 207*9e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 208*9e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 209*9e1d4b82SJeremy L Thompson 210*9e1d4b82SJeremy L Thompson // Apply basis element by element 211*9e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 212*9e1d4b82SJeremy L Thompson // Clear register 213*9e1d4b82SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 214*9e1d4b82SJeremy L Thompson 215*9e1d4b82SJeremy L Thompson // Map from points 216*9e1d4b82SJeremy L Thompson const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 217*9e1d4b82SJeremy L Thompson 218*9e1d4b82SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { 219*9e1d4b82SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 220*9e1d4b82SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 221*9e1d4b82SJeremy L Thompson 222*9e1d4b82SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 223*9e1d4b82SJeremy L Thompson r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p]; 224*9e1d4b82SJeremy L Thompson } 225*9e1d4b82SJeremy L Thompson for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) { 226*9e1d4b82SJeremy L Thompson if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p]; 227*9e1d4b82SJeremy L Thompson else r_U[j] = 0.0; 228*9e1d4b82SJeremy L Thompson } 229*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 230*9e1d4b82SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 231*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 232*9e1d4b82SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 233*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 234*9e1d4b82SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 235*9e1d4b82SJeremy L Thompson } 236*9e1d4b82SJeremy L Thompson } 237*9e1d4b82SJeremy L Thompson __syncthreads(); 238*9e1d4b82SJeremy L Thompson 239*9e1d4b82SJeremy L Thompson // Map from coefficients 240*9e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 241*9e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 242*9e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 243*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 244*9e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 245*9e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 246*9e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 247*9e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V); 248*9e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 249*9e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 250*9e1d4b82SJeremy L Thompson } 251*9e1d4b82SJeremy L Thompson } 252*9e1d4b82SJeremy L Thompson } 253