19e1d4b82SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 29e1d4b82SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e1d4b82SJeremy L Thompson // 49e1d4b82SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e1d4b82SJeremy L Thompson // 69e1d4b82SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e1d4b82SJeremy L Thompson 89e1d4b82SJeremy L Thompson /// @file 99e1d4b82SJeremy L Thompson /// Internal header for HIP tensor product basis with AtPoints evaluation 109e1d4b82SJeremy L Thompson #include <ceed/types.h> 119e1d4b82SJeremy L Thompson 129e1d4b82SJeremy L Thompson #include "hip-shared-basis-read-write-templates.h" 139e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-at-points-templates.h" 149e1d4b82SJeremy L Thompson #include "hip-shared-basis-tensor-templates.h" 159e1d4b82SJeremy L Thompson 169e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 179e1d4b82SJeremy L Thompson // Tensor Basis Kernels AtPoints 189e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 199e1d4b82SJeremy L Thompson 209e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 219e1d4b82SJeremy L Thompson // Interp 229e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 239e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 24*aa4002adSJeremy L Thompson void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 25*aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 269e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 279e1d4b82SJeremy L Thompson 289e1d4b82SJeremy L Thompson SharedData_Hip data; 299e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 309e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 319e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 329e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 339e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 349e1d4b82SJeremy L Thompson 35b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 369e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 379e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 389e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP]; 399e1d4b82SJeremy L Thompson 40*aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 41*aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 42*aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 43*aa4002adSJeremy L Thompson __syncthreads(); 44*aa4002adSJeremy L Thompson 459e1d4b82SJeremy L Thompson // Apply basis element by element 469e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 479e1d4b82SJeremy L Thompson // Map to coefficients 489e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 499e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 509e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 519e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 529e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 539e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 549e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 559e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 569e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 579e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 589e1d4b82SJeremy L Thompson } 599e1d4b82SJeremy L Thompson 609e1d4b82SJeremy L Thompson // Map to points 61b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 62b6a2eb79SJeremy L Thompson 63b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 64b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 65b6a2eb79SJeremy L Thompson 66b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 67b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 68b6a2eb79SJeremy L Thompson InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 69b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 70b6a2eb79SJeremy L Thompson InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 71b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 72b6a2eb79SJeremy L Thompson InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 73b6a2eb79SJeremy L Thompson } 74b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 75b6a2eb79SJeremy L Thompson } 769e1d4b82SJeremy L Thompson } 779e1d4b82SJeremy L Thompson } 789e1d4b82SJeremy L Thompson 799e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 80*aa4002adSJeremy L Thompson void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 81*aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 829e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 839e1d4b82SJeremy L Thompson 849e1d4b82SJeremy L Thompson SharedData_Hip data; 859e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 869e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 879e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 889e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 899e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 909e1d4b82SJeremy L Thompson 91b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 929e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP]; 939e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 949e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 959e1d4b82SJeremy L Thompson 96*aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 97*aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 98*aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 99*aa4002adSJeremy L Thompson __syncthreads(); 100*aa4002adSJeremy L Thompson 1019e1d4b82SJeremy L Thompson // Apply basis element by element 1029e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 103b6a2eb79SJeremy L Thompson // Clear register 104b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 105b6a2eb79SJeremy L Thompson 1069e1d4b82SJeremy L Thompson // Map from points 107b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 108b6a2eb79SJeremy L Thompson 109b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 110b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 111b6a2eb79SJeremy L Thompson 112b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 113b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 114b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 115b6a2eb79SJeremy L Thompson InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 116b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 117b6a2eb79SJeremy L Thompson InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 118b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 119b6a2eb79SJeremy L Thompson InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 120b6a2eb79SJeremy L Thompson } 121b6a2eb79SJeremy L Thompson } 122b6a2eb79SJeremy L Thompson __syncthreads(); 1239e1d4b82SJeremy L Thompson 1249e1d4b82SJeremy L Thompson // Map from coefficients 1259e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 1269e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 1279e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 1289e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 1299e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 1309e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 1319e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 1329e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 1339e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 1349e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 1359e1d4b82SJeremy L Thompson } 1369e1d4b82SJeremy L Thompson } 1379e1d4b82SJeremy L Thompson } 1389e1d4b82SJeremy L Thompson 1399e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1409e1d4b82SJeremy L Thompson // Grad 1419e1d4b82SJeremy L Thompson //------------------------------------------------------------------------------ 1429e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 143*aa4002adSJeremy L Thompson void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 144*aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 1459e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 1469e1d4b82SJeremy L Thompson 1479e1d4b82SJeremy L Thompson SharedData_Hip data; 1489e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 1499e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 1509e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 1519e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 1529e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 1539e1d4b82SJeremy L Thompson 154b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 1559e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 1569e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 1579e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 1589e1d4b82SJeremy L Thompson 159*aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 160*aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 161*aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 162*aa4002adSJeremy L Thompson __syncthreads(); 163*aa4002adSJeremy L Thompson 1649e1d4b82SJeremy L Thompson // Apply basis element by element 1659e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 1669e1d4b82SJeremy L Thompson // Map to coefficients 1679e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 1689e1d4b82SJeremy L Thompson ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 1699e1d4b82SJeremy L Thompson Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 1709e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 1719e1d4b82SJeremy L Thompson ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 1729e1d4b82SJeremy L Thompson InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 1739e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 1749e1d4b82SJeremy L Thompson ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 1759e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 1769e1d4b82SJeremy L Thompson InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 1779e1d4b82SJeremy L Thompson } 1789e1d4b82SJeremy L Thompson 1799e1d4b82SJeremy L Thompson // Map to points 180b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 181b6a2eb79SJeremy L Thompson 182b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 183b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 184b6a2eb79SJeremy L Thompson 185b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 186b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 187b6a2eb79SJeremy L Thompson GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 188b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 189b6a2eb79SJeremy L Thompson GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 190b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 191b6a2eb79SJeremy L Thompson GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 192b6a2eb79SJeremy L Thompson } 193b6a2eb79SJeremy L Thompson WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 194b6a2eb79SJeremy L Thompson } 1959e1d4b82SJeremy L Thompson } 1969e1d4b82SJeremy L Thompson } 1979e1d4b82SJeremy L Thompson 1989e1d4b82SJeremy L Thompson extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 199*aa4002adSJeremy L Thompson void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 200*aa4002adSJeremy L Thompson const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 2019e1d4b82SJeremy L Thompson extern __shared__ CeedScalar slice[]; 2029e1d4b82SJeremy L Thompson 2039e1d4b82SJeremy L Thompson SharedData_Hip data; 2049e1d4b82SJeremy L Thompson data.t_id_x = threadIdx.x; 2059e1d4b82SJeremy L Thompson data.t_id_y = threadIdx.y; 2069e1d4b82SJeremy L Thompson data.t_id_z = threadIdx.z; 2079e1d4b82SJeremy L Thompson data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 2089e1d4b82SJeremy L Thompson data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 2099e1d4b82SJeremy L Thompson 210b6a2eb79SJeremy L Thompson CeedScalar r_X[BASIS_DIM]; 2119e1d4b82SJeremy L Thompson CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 2129e1d4b82SJeremy L Thompson CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2139e1d4b82SJeremy L Thompson CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 2149e1d4b82SJeremy L Thompson 215*aa4002adSJeremy L Thompson // load chebyshev_interp_1d into shared memory 216*aa4002adSJeremy L Thompson __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 217*aa4002adSJeremy L Thompson LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 218*aa4002adSJeremy L Thompson __syncthreads(); 219*aa4002adSJeremy L Thompson 2209e1d4b82SJeremy L Thompson // Apply basis element by element 2219e1d4b82SJeremy L Thompson for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 222b6a2eb79SJeremy L Thompson // Clear register 223b6a2eb79SJeremy L Thompson for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 224b6a2eb79SJeremy L Thompson 2259e1d4b82SJeremy L Thompson // Map from points 226b6a2eb79SJeremy L Thompson const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 227b6a2eb79SJeremy L Thompson 228b6a2eb79SJeremy L Thompson for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 229b6a2eb79SJeremy L Thompson const CeedInt p = i % BASIS_NUM_PTS; 230b6a2eb79SJeremy L Thompson 231b6a2eb79SJeremy L Thompson ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 232b6a2eb79SJeremy L Thompson ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 233b6a2eb79SJeremy L Thompson r_U); 234b6a2eb79SJeremy L Thompson if (BASIS_DIM == 1) { 235b6a2eb79SJeremy L Thompson GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 236b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 2) { 237b6a2eb79SJeremy L Thompson GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 238b6a2eb79SJeremy L Thompson } else if (BASIS_DIM == 3) { 239b6a2eb79SJeremy L Thompson GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 240b6a2eb79SJeremy L Thompson } 241b6a2eb79SJeremy L Thompson } 242b6a2eb79SJeremy L Thompson __syncthreads(); 2439e1d4b82SJeremy L Thompson 2449e1d4b82SJeremy L Thompson // Map from coefficients 2459e1d4b82SJeremy L Thompson if (BASIS_DIM == 1) { 2469e1d4b82SJeremy L Thompson InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 2479e1d4b82SJeremy L Thompson SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 2489e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 2) { 2499e1d4b82SJeremy L Thompson InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 2509e1d4b82SJeremy L Thompson SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2519e1d4b82SJeremy L Thompson } else if (BASIS_DIM == 3) { 2529e1d4b82SJeremy L Thompson InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 2539e1d4b82SJeremy L Thompson SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 2549e1d4b82SJeremy L Thompson BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 2559e1d4b82SJeremy L Thompson } 2569e1d4b82SJeremy L Thompson } 2579e1d4b82SJeremy L Thompson } 258