1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson #include "cuda-shared-basis-read-write-templates.h" 139e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-at-points-templates.h" 149e1d4b82SJeremy L Thompson #include "cuda-shared-basis-tensor-templates.h" 159e1d4b82SJeremy L Thompson 169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 179e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 199e1d4b82SJeremy L Thompson 209e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 219e1d4b82SJeremy L Thompson // Interp 229e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 239e1d4b82SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 249e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 259e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 269e1d4b82SJeremy L Thompson 279e1d4b82SJeremy L Thompson SharedData_Cuda data; 289e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 299e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 309e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 319e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 3299421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 339e1d4b82SJeremy L Thompson 34b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 359e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 369e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 379e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 389e1d4b82SJeremy L Thompson 39aa4002adSJeremy L Thompson // load interp_1d into shared memory 40aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 41aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 42aa4002adSJeremy L Thompson __syncthreads(); 43aa4002adSJeremy L Thompson 449e1d4b82SJeremy L Thompson // Apply basis element by element 459e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 469e1d4b82SJeremy L Thompson // Map to coefficients 479e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 489e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 4999421279SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 509e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 519e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 5299421279SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 539e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 549e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 559e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 5699421279SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 579e1d4b82SJeremy L Thompson } 589e1d4b82SJeremy L Thompson 599e1d4b82SJeremy L Thompson // Map to points 60b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 61b6a2eb79SJeremy L Thompson 62b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 63b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 64b6a2eb79SJeremy L Thompson 65b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 66b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 67f725b54bSJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 68b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 69f725b54bSJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 70b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 71f725b54bSJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 72b6a2eb79SJeremy L Thompson } 73b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 74b6a2eb79SJeremy L Thompson } 759e1d4b82SJeremy L Thompson } 769e1d4b82SJeremy L Thompson } 779e1d4b82SJeremy L Thompson 789e1d4b82SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 799e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 809e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 819e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 829e1d4b82SJeremy L Thompson 839e1d4b82SJeremy L Thompson SharedData_Cuda data; 849e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 859e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 869e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 879e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 8899421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 899e1d4b82SJeremy L Thompson 90b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 919e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 929e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 939e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 949e1d4b82SJeremy L Thompson 95aa4002adSJeremy L Thompson // load interp_1d into shared memory 96aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 97aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 98aa4002adSJeremy L Thompson __syncthreads(); 99aa4002adSJeremy L Thompson 1009e1d4b82SJeremy L Thompson // Apply basis element by element 1019e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 102b6a2eb79SJeremy L Thompson // Clear register 103b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 104b6a2eb79SJeremy L Thompson 105af0e6e89SJeremy L Thompson // Clear output vector 106af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 107af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 108af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 109af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 110af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 111af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 112af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 113af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 114af0e6e89SJeremy L Thompson } 115af0e6e89SJeremy L Thompson 116af0e6e89SJeremy L Thompson // Map from points 117af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 118af0e6e89SJeremy L Thompson 119af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 120af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 121af0e6e89SJeremy L Thompson 122af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 123af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 124af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 125f725b54bSJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 126af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 127f725b54bSJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 128af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 129f725b54bSJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 130af0e6e89SJeremy L Thompson } 131af0e6e89SJeremy L Thompson } 132af0e6e89SJeremy L Thompson 133af0e6e89SJeremy L Thompson // Map from coefficients 134af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 13599421279SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 136af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 137af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 13899421279SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 139af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 140af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 14199421279SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 142af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 143af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 144af0e6e89SJeremy L Thompson } 145af0e6e89SJeremy L Thompson } 146af0e6e89SJeremy L Thompson } 147af0e6e89SJeremy L Thompson 148af0e6e89SJeremy L Thompson extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 149af0e6e89SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 150af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 151af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 152af0e6e89SJeremy L Thompson 153af0e6e89SJeremy L Thompson SharedData_Cuda data; 154af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 155af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 156af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 157af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 15899421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 159af0e6e89SJeremy L Thompson 160af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 161af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 162af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 163af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 164af0e6e89SJeremy L Thompson 165af0e6e89SJeremy L Thompson // load interp_1d into shared memory 166af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 167af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 168af0e6e89SJeremy L Thompson __syncthreads(); 169af0e6e89SJeremy L Thompson 170af0e6e89SJeremy L Thompson // Apply basis element by element 171af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 172af0e6e89SJeremy L Thompson // Clear register 173af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 174af0e6e89SJeremy L Thompson 1759e1d4b82SJeremy L Thompson // Map from points 176b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 177b6a2eb79SJeremy L Thompson 178b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 179b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 180b6a2eb79SJeremy L Thompson 181b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 182b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 183b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 184f725b54bSJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 185b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 186f725b54bSJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 187b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 188f725b54bSJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 189b6a2eb79SJeremy L Thompson } 190b6a2eb79SJeremy L Thompson } 1919e1d4b82SJeremy L Thompson 1929e1d4b82SJeremy L Thompson // Map from coefficients 1939e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 19499421279SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 1959e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 1969e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 19799421279SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 1989e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 1999e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 20099421279SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 2019e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2029e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2039e1d4b82SJeremy L Thompson } 2049e1d4b82SJeremy L Thompson } 2059e1d4b82SJeremy L Thompson } 2069e1d4b82SJeremy L Thompson 2079e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2089e1d4b82SJeremy L Thompson // Grad 2099e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2109e1d4b82SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, 2119e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2129e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2139e1d4b82SJeremy L Thompson 2149e1d4b82SJeremy L Thompson SharedData_Cuda data; 2159e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2169e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2179e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2189e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 21999421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 2209e1d4b82SJeremy L Thompson 221b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2229e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 2239e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2249e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 2259e1d4b82SJeremy L Thompson 226aa4002adSJeremy L Thompson // load interp_1d into shared memory 227aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 228aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 229aa4002adSJeremy L Thompson __syncthreads(); 230aa4002adSJeremy L Thompson 2319e1d4b82SJeremy L Thompson // Apply basis element by element 2329e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 2339e1d4b82SJeremy L Thompson // Map to coefficients 2349e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 2359e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 23699421279SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2379e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 2389e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 23999421279SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2409e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 2419e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2429e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 24399421279SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2449e1d4b82SJeremy L Thompson } 2459e1d4b82SJeremy L Thompson 2469e1d4b82SJeremy L Thompson // Map to points 247b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 248b6a2eb79SJeremy L Thompson 249b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 250b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 251b6a2eb79SJeremy L Thompson 252b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 253b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 254f725b54bSJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 255b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 256f725b54bSJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 257b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 258f725b54bSJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 259b6a2eb79SJeremy L Thompson } 260b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 261b6a2eb79SJeremy L Thompson } 2629e1d4b82SJeremy L Thompson } 2639e1d4b82SJeremy L Thompson } 2649e1d4b82SJeremy L Thompson 2659e1d4b82SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 2669e1d4b82SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 2679e1d4b82SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2689e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2699e1d4b82SJeremy L Thompson 2709e1d4b82SJeremy L Thompson SharedData_Cuda data; 2719e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2729e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2739e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2749e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 27599421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 2769e1d4b82SJeremy L Thompson 277b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2789e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 2799e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2809e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2819e1d4b82SJeremy L Thompson 282aa4002adSJeremy L Thompson // load interp_1d into shared memory 283aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 284aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 285aa4002adSJeremy L Thompson __syncthreads(); 286aa4002adSJeremy L Thompson 2879e1d4b82SJeremy L Thompson // Apply basis element by element 2889e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 289b6a2eb79SJeremy L Thompson // Clear register 290b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 291b6a2eb79SJeremy L Thompson 292af0e6e89SJeremy L Thompson // Clear output vector 293af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 294af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 295af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 296af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 297af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 298af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 299af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 300af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 301af0e6e89SJeremy L Thompson } 302af0e6e89SJeremy L Thompson 303af0e6e89SJeremy L Thompson // Map from points 304af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 305af0e6e89SJeremy L Thompson 306af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 307af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 308af0e6e89SJeremy L Thompson 309af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 310af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 311af0e6e89SJeremy L Thompson r_U); 312af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 313f725b54bSJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 314af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 315f725b54bSJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 316af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 317f725b54bSJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 318af0e6e89SJeremy L Thompson } 319af0e6e89SJeremy L Thompson } 320af0e6e89SJeremy L Thompson 321af0e6e89SJeremy L Thompson // Map from coefficients 322af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 32399421279SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 324af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 325af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 32699421279SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 327af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 328af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 32999421279SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 330af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 331af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 332af0e6e89SJeremy L Thompson } 333af0e6e89SJeremy L Thompson } 334af0e6e89SJeremy L Thompson } 335af0e6e89SJeremy L Thompson 336af0e6e89SJeremy L Thompson extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, 337af0e6e89SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, 338af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 339af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 340af0e6e89SJeremy L Thompson 341af0e6e89SJeremy L Thompson SharedData_Cuda data; 342af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 343af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 344af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 345af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 34699421279SJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 347af0e6e89SJeremy L Thompson 348af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 349af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 350af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 351af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 352af0e6e89SJeremy L Thompson 353af0e6e89SJeremy L Thompson // load interp_1d into shared memory 354af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 355af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 356af0e6e89SJeremy L Thompson __syncthreads(); 357af0e6e89SJeremy L Thompson 358af0e6e89SJeremy L Thompson // Apply basis element by element 359af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 360af0e6e89SJeremy L Thompson // Clear register 361af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 362af0e6e89SJeremy L Thompson 3639e1d4b82SJeremy L Thompson // Map from points 364b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 365b6a2eb79SJeremy L Thompson 366b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 367b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 368b6a2eb79SJeremy L Thompson 369b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 370b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 371b6a2eb79SJeremy L Thompson r_U); 372b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 373f725b54bSJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 374b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 375f725b54bSJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 376b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 377f725b54bSJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 378b6a2eb79SJeremy L Thompson } 379b6a2eb79SJeremy L Thompson } 3809e1d4b82SJeremy L Thompson 3819e1d4b82SJeremy L Thompson // Map from coefficients 3829e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 38399421279SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3849e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 3859e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 38699421279SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3879e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3889e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 38999421279SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3909e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 3919e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3929e1d4b82SJeremy L Thompson } 3939e1d4b82SJeremy L Thompson } 3949e1d4b82SJeremy L Thompson } 395