19e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson #include "cuda-shared-basis-read-write-templates.h" 139e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-at-points-templates.h" 149e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-templates.h" 159e1d4b82SJeremy L Thompson 169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 179e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 199e1d4b82SJeremy L Thompson 209e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 219e1d4b82SJeremy L Thompson // Interp 229e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 239e1d4b82SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 249e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 259e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 269e1d4b82SJeremy L Thompson 279e1d4b82SJeremy L Thompson SharedData_Cuda data; 289e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 299e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 309e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 319e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 329e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 339e1d4b82SJeremy L Thompson 34b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 359e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 369e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 379e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 389e1d4b82SJeremy L Thompson 39aa4002adSJeremy L Thompson // load interp_1d into shared memory 40aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 41aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 42aa4002adSJeremy L Thompson __syncthreads(); 43aa4002adSJeremy L Thompson 449e1d4b82SJeremy L Thompson // Apply basis element by element 459e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 469e1d4b82SJeremy L Thompson // Map to coefficients 479e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 489e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 49aa4002adSJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 509e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 519e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 52aa4002adSJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 539e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 549e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 559e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 56aa4002adSJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 579e1d4b82SJeremy L Thompson } 589e1d4b82SJeremy L Thompson 599e1d4b82SJeremy L Thompson // Map to points 60b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 61b6a2eb79SJeremy L Thompson 62b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 63b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 64b6a2eb79SJeremy L Thompson 65b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 66b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 67b6a2eb79SJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 68b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 69b6a2eb79SJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 70b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 71b6a2eb79SJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 72b6a2eb79SJeremy L Thompson } 73b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 74b6a2eb79SJeremy L Thompson } 759e1d4b82SJeremy L Thompson } 769e1d4b82SJeremy L Thompson } 779e1d4b82SJeremy L Thompson 789e1d4b82SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 799e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 809e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 819e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 829e1d4b82SJeremy L Thompson 839e1d4b82SJeremy L Thompson SharedData_Cuda data; 849e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 859e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 869e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 879e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 889e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 899e1d4b82SJeremy L Thompson 90b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 919e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 929e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 939e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 949e1d4b82SJeremy L Thompson 95aa4002adSJeremy L Thompson // load interp_1d into shared memory 96aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 97aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 98aa4002adSJeremy L Thompson __syncthreads(); 99aa4002adSJeremy L Thompson 1009e1d4b82SJeremy L Thompson // Apply basis element by element 1019e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 102b6a2eb79SJeremy L Thompson // Clear register 103b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 104b6a2eb79SJeremy L Thompson 105*af0e6e89SJeremy L Thompson // Clear output vector 106*af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 107*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 108*af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 109*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 110*af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 111*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 112*af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 113*af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 114*af0e6e89SJeremy L Thompson } 115*af0e6e89SJeremy L Thompson 116*af0e6e89SJeremy L Thompson // Map from points 117*af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 118*af0e6e89SJeremy L Thompson 119*af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 120*af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 121*af0e6e89SJeremy L Thompson 122*af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 123*af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 124*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 125*af0e6e89SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 126*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 127*af0e6e89SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 128*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 129*af0e6e89SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 130*af0e6e89SJeremy L Thompson } 131*af0e6e89SJeremy L Thompson } 132*af0e6e89SJeremy L Thompson __syncthreads(); 133*af0e6e89SJeremy L Thompson 134*af0e6e89SJeremy L Thompson // Map from coefficients 135*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 136*af0e6e89SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 137*af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 138*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 139*af0e6e89SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 140*af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 141*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 142*af0e6e89SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 143*af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 144*af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 145*af0e6e89SJeremy L Thompson } 146*af0e6e89SJeremy L Thompson } 147*af0e6e89SJeremy L Thompson } 148*af0e6e89SJeremy L Thompson 149*af0e6e89SJeremy L Thompson extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 150*af0e6e89SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 151*af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 152*af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 153*af0e6e89SJeremy L Thompson 154*af0e6e89SJeremy L Thompson SharedData_Cuda data; 155*af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 156*af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 157*af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 158*af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 159*af0e6e89SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 160*af0e6e89SJeremy L Thompson 161*af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 162*af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 163*af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 164*af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 165*af0e6e89SJeremy L Thompson 166*af0e6e89SJeremy L Thompson // load interp_1d into shared memory 167*af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 168*af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 169*af0e6e89SJeremy L Thompson __syncthreads(); 170*af0e6e89SJeremy L Thompson 171*af0e6e89SJeremy L Thompson // Apply basis element by element 172*af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 173*af0e6e89SJeremy L Thompson // Clear register 174*af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 175*af0e6e89SJeremy L Thompson 1769e1d4b82SJeremy L Thompson // Map from points 177b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 178b6a2eb79SJeremy L Thompson 179b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 180b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 181b6a2eb79SJeremy L Thompson 182b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 183b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 184b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 185b6a2eb79SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 186b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 187b6a2eb79SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 188b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 189b6a2eb79SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 190b6a2eb79SJeremy L Thompson } 191b6a2eb79SJeremy L Thompson } 192b6a2eb79SJeremy L Thompson __syncthreads(); 1939e1d4b82SJeremy L Thompson 1949e1d4b82SJeremy L Thompson // Map from coefficients 1959e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 196aa4002adSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 1979e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 1989e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 199aa4002adSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 2009e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2019e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 202aa4002adSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 2039e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2049e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2059e1d4b82SJeremy L Thompson } 2069e1d4b82SJeremy L Thompson } 2079e1d4b82SJeremy L Thompson } 2089e1d4b82SJeremy L Thompson 2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2109e1d4b82SJeremy L Thompson // Grad 2119e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2129e1d4b82SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 2139e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2149e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2159e1d4b82SJeremy L Thompson 2169e1d4b82SJeremy L Thompson SharedData_Cuda data; 2179e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2189e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2199e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2209e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 2219e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 2229e1d4b82SJeremy L Thompson 223b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2249e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 2259e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2269e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 2279e1d4b82SJeremy L Thompson 228aa4002adSJeremy L Thompson // load interp_1d into shared memory 229aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 230aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 231aa4002adSJeremy L Thompson __syncthreads(); 232aa4002adSJeremy L Thompson 2339e1d4b82SJeremy L Thompson // Apply basis element by element 2349e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 2359e1d4b82SJeremy L Thompson // Map to coefficients 2369e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 2379e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 238aa4002adSJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 2399e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 2409e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 241aa4002adSJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 2429e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 2439e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2449e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 245aa4002adSJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 2469e1d4b82SJeremy L Thompson } 2479e1d4b82SJeremy L Thompson 2489e1d4b82SJeremy L Thompson // Map to points 249b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 250b6a2eb79SJeremy L Thompson 251b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 252b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 253b6a2eb79SJeremy L Thompson 254b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 255b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 256b6a2eb79SJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 257b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 258b6a2eb79SJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 259b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 260b6a2eb79SJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 261b6a2eb79SJeremy L Thompson } 262b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 263b6a2eb79SJeremy L Thompson } 2649e1d4b82SJeremy L Thompson } 2659e1d4b82SJeremy L Thompson } 2669e1d4b82SJeremy L Thompson 2679e1d4b82SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 2689e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 2699e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2709e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2719e1d4b82SJeremy L Thompson 2729e1d4b82SJeremy L Thompson SharedData_Cuda data; 2739e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2749e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2759e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2769e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 2779e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 2789e1d4b82SJeremy L Thompson 279b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2809e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 2819e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2829e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2839e1d4b82SJeremy L Thompson 284aa4002adSJeremy L Thompson // load interp_1d into shared memory 285aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 286aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 287aa4002adSJeremy L Thompson __syncthreads(); 288aa4002adSJeremy L Thompson 2899e1d4b82SJeremy L Thompson // Apply basis element by element 2909e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 291b6a2eb79SJeremy L Thompson // Clear register 292b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 293b6a2eb79SJeremy L Thompson 294*af0e6e89SJeremy L Thompson // Clear output vector 295*af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 296*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 297*af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 298*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 299*af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 300*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 301*af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 302*af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 303*af0e6e89SJeremy L Thompson } 304*af0e6e89SJeremy L Thompson 305*af0e6e89SJeremy L Thompson // Map from points 306*af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 307*af0e6e89SJeremy L Thompson 308*af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 309*af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 310*af0e6e89SJeremy L Thompson 311*af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 312*af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 313*af0e6e89SJeremy L Thompson r_U); 314*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 315*af0e6e89SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 316*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 317*af0e6e89SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 318*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 319*af0e6e89SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 320*af0e6e89SJeremy L Thompson } 321*af0e6e89SJeremy L Thompson } 322*af0e6e89SJeremy L Thompson __syncthreads(); 323*af0e6e89SJeremy L Thompson 324*af0e6e89SJeremy L Thompson // Map from coefficients 325*af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 326*af0e6e89SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 327*af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 328*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 329*af0e6e89SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 330*af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 331*af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 332*af0e6e89SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 333*af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 334*af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 335*af0e6e89SJeremy L Thompson } 336*af0e6e89SJeremy L Thompson } 337*af0e6e89SJeremy L Thompson } 338*af0e6e89SJeremy L Thompson 339*af0e6e89SJeremy L Thompson extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 340*af0e6e89SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 341*af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 342*af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 343*af0e6e89SJeremy L Thompson 344*af0e6e89SJeremy L Thompson SharedData_Cuda data; 345*af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 346*af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 347*af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 348*af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 349*af0e6e89SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 350*af0e6e89SJeremy L Thompson 351*af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 352*af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 353*af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 354*af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 355*af0e6e89SJeremy L Thompson 356*af0e6e89SJeremy L Thompson // load interp_1d into shared memory 357*af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 358*af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 359*af0e6e89SJeremy L Thompson __syncthreads(); 360*af0e6e89SJeremy L Thompson 361*af0e6e89SJeremy L Thompson // Apply basis element by element 362*af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 363*af0e6e89SJeremy L Thompson // Clear register 364*af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 365*af0e6e89SJeremy L Thompson 3669e1d4b82SJeremy L Thompson // Map from points 367b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 368b6a2eb79SJeremy L Thompson 369b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 370b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 371b6a2eb79SJeremy L Thompson 372b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 373b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 374b6a2eb79SJeremy L Thompson r_U); 375b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 376b6a2eb79SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 377b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 378b6a2eb79SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 379b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 380b6a2eb79SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 381b6a2eb79SJeremy L Thompson } 382b6a2eb79SJeremy L Thompson } 383b6a2eb79SJeremy L Thompson __syncthreads(); 3849e1d4b82SJeremy L Thompson 3859e1d4b82SJeremy L Thompson // Map from coefficients 3869e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 387aa4002adSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 3889e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 3899e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 390aa4002adSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 3919e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3929e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 393aa4002adSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 3949e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 3959e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3969e1d4b82SJeremy L Thompson } 3979e1d4b82SJeremy L Thompson } 3989e1d4b82SJeremy L Thompson } 399