19e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for HIP tensor product basis with AtPoints evaluation 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson #include "hip-shared-basis-read-write-templates.h" 139e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-at-points-templates.h" 149e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-templates.h" 159e1d4b82SJeremy L Thompson 169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 179e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 199e1d4b82SJeremy L Thompson 209e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 219e1d4b82SJeremy L Thompson // Interp 229e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 239e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 24aa4002adSJeremy L Thompson void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 25aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 269e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 279e1d4b82SJeremy L Thompson 289e1d4b82SJeremy L Thompson SharedData_Hip data; 299e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 309e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 319e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 329e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 33*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 349e1d4b82SJeremy L Thompson 35b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 369e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 379e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 389e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 399e1d4b82SJeremy L Thompson 40aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 41aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 42aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 43aa4002adSJeremy L Thompson __syncthreads(); 44aa4002adSJeremy L Thompson 459e1d4b82SJeremy L Thompson // Apply basis element by element 469e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 479e1d4b82SJeremy L Thompson // Map to coefficients 489e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 499e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 50*6b92dc4bSJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 519e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 529e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 53*6b92dc4bSJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 549e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 559e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 569e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 57*6b92dc4bSJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson 609e1d4b82SJeremy L Thompson // Map to points 61b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 62b6a2eb79SJeremy L Thompson 63b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 64b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 65b6a2eb79SJeremy L Thompson 66b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 67b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 68b6a2eb79SJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 69b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 70b6a2eb79SJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 71b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 72b6a2eb79SJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 73b6a2eb79SJeremy L Thompson } 74b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 75b6a2eb79SJeremy L Thompson } 769e1d4b82SJeremy L Thompson } 779e1d4b82SJeremy L Thompson } 789e1d4b82SJeremy L Thompson 799e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 80aa4002adSJeremy L Thompson void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 81aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 829e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 839e1d4b82SJeremy L Thompson 849e1d4b82SJeremy L Thompson SharedData_Hip data; 859e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 869e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 879e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 889e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 89*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 909e1d4b82SJeremy L Thompson 91b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 929e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 939e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 949e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 959e1d4b82SJeremy L Thompson 96aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 97aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 98aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 99aa4002adSJeremy L Thompson __syncthreads(); 100aa4002adSJeremy L Thompson 1019e1d4b82SJeremy L Thompson // Apply basis element by element 1029e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 103b6a2eb79SJeremy L Thompson // Clear register 104b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 105b6a2eb79SJeremy L Thompson 106af0e6e89SJeremy L Thompson // Clear output vector 107af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 108af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 109af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 110af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 111af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 112af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 113af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 114af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 115af0e6e89SJeremy L Thompson } 116af0e6e89SJeremy L Thompson 117af0e6e89SJeremy L Thompson // Map from points 118af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 119af0e6e89SJeremy L Thompson 120af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 121af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 122af0e6e89SJeremy L Thompson 123af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 124af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 125af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 126af0e6e89SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 127af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 128af0e6e89SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 129af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 130af0e6e89SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 131af0e6e89SJeremy L Thompson } 132af0e6e89SJeremy L Thompson } 133af0e6e89SJeremy L Thompson __syncthreads(); 134af0e6e89SJeremy L Thompson 135af0e6e89SJeremy L Thompson // Map from coefficients 136af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 137*6b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 138af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 139af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 140*6b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 141af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 142af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 143*6b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 144af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 145af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 146af0e6e89SJeremy L Thompson } 147af0e6e89SJeremy L Thompson } 148af0e6e89SJeremy L Thompson } 149af0e6e89SJeremy L Thompson 150af0e6e89SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 151af0e6e89SJeremy L Thompson void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 152af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 153af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 154af0e6e89SJeremy L Thompson 155af0e6e89SJeremy L Thompson SharedData_Hip data; 156af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 157af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 158af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 159af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 160*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 161af0e6e89SJeremy L Thompson 162af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 163af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 164af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 165af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 166af0e6e89SJeremy L Thompson 167af0e6e89SJeremy L Thompson // load chebyshev_interp_1d into shared memory 168af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 169af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 170af0e6e89SJeremy L Thompson __syncthreads(); 171af0e6e89SJeremy L Thompson 172af0e6e89SJeremy L Thompson // Apply basis element by element 173af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 174af0e6e89SJeremy L Thompson // Clear register 175af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 176af0e6e89SJeremy L Thompson 1779e1d4b82SJeremy L Thompson // Map from points 178b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 179b6a2eb79SJeremy L Thompson 180b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 181b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 182b6a2eb79SJeremy L Thompson 183b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 184b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 185b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 186b6a2eb79SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 187b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 188b6a2eb79SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 189b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 190b6a2eb79SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 191b6a2eb79SJeremy L Thompson } 192b6a2eb79SJeremy L Thompson } 193b6a2eb79SJeremy L Thompson __syncthreads(); 1949e1d4b82SJeremy L Thompson 1959e1d4b82SJeremy L Thompson // Map from coefficients 1969e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 197*6b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 1989e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 1999e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 200*6b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 2019e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2029e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 203*6b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 2049e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2059e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2069e1d4b82SJeremy L Thompson } 2079e1d4b82SJeremy L Thompson } 2089e1d4b82SJeremy L Thompson } 2099e1d4b82SJeremy L Thompson 2109e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2119e1d4b82SJeremy L Thompson // Grad 2129e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 2139e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 214aa4002adSJeremy L Thompson void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 215aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2169e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2179e1d4b82SJeremy L Thompson 2189e1d4b82SJeremy L Thompson SharedData_Hip data; 2199e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2209e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2219e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2229e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 223*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 2249e1d4b82SJeremy L Thompson 225b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2269e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 2279e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2289e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 2299e1d4b82SJeremy L Thompson 230aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 231aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 232aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 233aa4002adSJeremy L Thompson __syncthreads(); 234aa4002adSJeremy L Thompson 2359e1d4b82SJeremy L Thompson // Apply basis element by element 2369e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 2379e1d4b82SJeremy L Thompson // Map to coefficients 2389e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 2399e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 240*6b92dc4bSJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2419e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 2429e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 243*6b92dc4bSJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2449e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 2459e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2469e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 247*6b92dc4bSJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C); 2489e1d4b82SJeremy L Thompson } 2499e1d4b82SJeremy L Thompson 2509e1d4b82SJeremy L Thompson // Map to points 251b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 252b6a2eb79SJeremy L Thompson 253b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 254b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 255b6a2eb79SJeremy L Thompson 256b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 257b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 258b6a2eb79SJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 259b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 260b6a2eb79SJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 261b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 262b6a2eb79SJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 263b6a2eb79SJeremy L Thompson } 264b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 265b6a2eb79SJeremy L Thompson } 2669e1d4b82SJeremy L Thompson } 2679e1d4b82SJeremy L Thompson } 2689e1d4b82SJeremy L Thompson 2699e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 270aa4002adSJeremy L Thompson void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 271aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2729e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2739e1d4b82SJeremy L Thompson 2749e1d4b82SJeremy L Thompson SharedData_Hip data; 2759e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2769e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2779e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2789e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 279*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 2809e1d4b82SJeremy L Thompson 281b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2829e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 2839e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2849e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2859e1d4b82SJeremy L Thompson 286aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 287aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 288aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 289aa4002adSJeremy L Thompson __syncthreads(); 290aa4002adSJeremy L Thompson 2919e1d4b82SJeremy L Thompson // Apply basis element by element 2929e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 293b6a2eb79SJeremy L Thompson // Clear register 294b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 295b6a2eb79SJeremy L Thompson 296af0e6e89SJeremy L Thompson // Clear output vector 297af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0; 298af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 299af0e6e89SJeremy L Thompson WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 300af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 301af0e6e89SJeremy L Thompson WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 302af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 303af0e6e89SJeremy L Thompson WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 304af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 305af0e6e89SJeremy L Thompson } 306af0e6e89SJeremy L Thompson 307af0e6e89SJeremy L Thompson // Map from points 308af0e6e89SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 309af0e6e89SJeremy L Thompson 310af0e6e89SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 311af0e6e89SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 312af0e6e89SJeremy L Thompson 313af0e6e89SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 314af0e6e89SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 315af0e6e89SJeremy L Thompson r_U); 316af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 317af0e6e89SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 318af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 319af0e6e89SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 320af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 321af0e6e89SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 322af0e6e89SJeremy L Thompson } 323af0e6e89SJeremy L Thompson } 324af0e6e89SJeremy L Thompson __syncthreads(); 325af0e6e89SJeremy L Thompson 326af0e6e89SJeremy L Thompson // Map from coefficients 327af0e6e89SJeremy L Thompson if (BASIS_DIM == 1) { 328*6b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 329af0e6e89SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 330af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 2) { 331*6b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 332af0e6e89SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 333af0e6e89SJeremy L Thompson } else if (BASIS_DIM == 3) { 334*6b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 335af0e6e89SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 336af0e6e89SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 337af0e6e89SJeremy L Thompson } 338af0e6e89SJeremy L Thompson } 339af0e6e89SJeremy L Thompson } 340af0e6e89SJeremy L Thompson 341af0e6e89SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 342af0e6e89SJeremy L Thompson void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 343af0e6e89SJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 344af0e6e89SJeremy L Thompson extern __shared__ CeedScalar slice[]; 345af0e6e89SJeremy L Thompson 346af0e6e89SJeremy L Thompson SharedData_Hip data; 347af0e6e89SJeremy L Thompson data.t_id_x = threadIdx.x; 348af0e6e89SJeremy L Thompson data.t_id_y = threadIdx.y; 349af0e6e89SJeremy L Thompson data.t_id_z = threadIdx.z; 350af0e6e89SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 351*6b92dc4bSJeremy L Thompson data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); 352af0e6e89SJeremy L Thompson 353af0e6e89SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 354af0e6e89SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 355af0e6e89SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 356af0e6e89SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 357af0e6e89SJeremy L Thompson 358af0e6e89SJeremy L Thompson // load chebyshev_interp_1d into shared memory 359af0e6e89SJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 360af0e6e89SJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 361af0e6e89SJeremy L Thompson __syncthreads(); 362af0e6e89SJeremy L Thompson 363af0e6e89SJeremy L Thompson // Apply basis element by element 364af0e6e89SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 365af0e6e89SJeremy L Thompson // Clear register 366af0e6e89SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 367af0e6e89SJeremy L Thompson 3689e1d4b82SJeremy L Thompson // Map from points 369b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 370b6a2eb79SJeremy L Thompson 371b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 372b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 373b6a2eb79SJeremy L Thompson 374b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 375b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 376b6a2eb79SJeremy L Thompson r_U); 377b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 378b6a2eb79SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 379b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 380b6a2eb79SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 381b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 382b6a2eb79SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 383b6a2eb79SJeremy L Thompson } 384b6a2eb79SJeremy L Thompson } 385b6a2eb79SJeremy L Thompson __syncthreads(); 3869e1d4b82SJeremy L Thompson 3879e1d4b82SJeremy L Thompson // Map from coefficients 3889e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 389*6b92dc4bSJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3909e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 3919e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 392*6b92dc4bSJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3939e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3949e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 395*6b92dc4bSJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V); 3969e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 3979e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 3989e1d4b82SJeremy L Thompson } 3999e1d4b82SJeremy L Thompson } 4009e1d4b82SJeremy L Thompson } 401