// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis
#ifndef _ceed_hip_shared_basis_tensor_h
#define _ceed_hip_shared_basis_tensor_h

#include <ceed.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//
// Layout shared by all kernels in this file: each z-slice of a block handles
// one element, and elements are covered with a grid-stride loop over
// blockIdx.x * blockDim.z.  Dynamic shared memory (`slice`) must provide
// blockDim.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1) CeedScalars; each z-slice
// indexes its own sub-span.  Branches on BASIS_DIM are compile-time constant.
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_interp_1d,
                                                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage the P_1D x Q_1D interpolation matrix in shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();  // s_B must be fully populated before any thread reads it

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Per-thread registers for input (nodal) and output (quadrature) values
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

// Transpose of Interp: quadrature values in, nodal values out.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_interp_1d,
                                                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage the P_1D x Q_1D interpolation matrix in shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();  // s_B must be fully populated before any thread reads it

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Per-thread registers; sizes are swapped relative to Interp (Q in, P out)
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *__restrict__ d_interp_1d,
                                                                         const CeedScalar *__restrict__ d_grad_1d, const CeedScalar *__restrict__ d_U,
                                                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d and grad_1d in shared memory; with a collocated gradient
  // the grad matrix is Q_1D x Q_1D, otherwise it is Q_1D x P_1D
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();  // s_B/s_G must be fully populated before any thread reads them

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Output holds BASIS_DIM derivative components per field component
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

// Transpose of Grad: BASIS_DIM derivative components in, nodal values out.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_interp_1d,
                                                                                  const CeedScalar *__restrict__ d_grad_1d,
                                                                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d and grad_1d in shared memory (same shapes as in Grad)
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();  // s_B/s_G must be fully populated before any thread reads them

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register sizes are swapped relative to Grad (Q * DIM in, P out)
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d,
                                                                             CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping handed to the templated helpers; each z-slice of
  // the block owns its own span of the dynamic shared scratch
  SharedData_Hip work;
  work.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  work.t_id_x = threadIdx.x;
  work.t_id_y = threadIdx.y;
  work.t_id_z = threadIdx.z;
  work.slice  = slice + work.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register storage for this thread's portion of the quadrature weights
  CeedScalar r_w[BASIS_DIM > 2 ? BASIS_Q_1D : 1];

  // One element per z-slice, grid-stride loop over the remaining elements
  for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      Weight1d<BASIS_Q_1D>(work, q_weight_1d, r_w);
      WriteElementStrided1d<1, BASIS_Q_1D>(work, e, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_w, d_W);
    } else if (BASIS_DIM == 2) {
      WeightTensor2d<BASIS_Q_1D>(work, q_weight_1d, r_w);
      WriteElementStrided2d<1, BASIS_Q_1D>(work, e, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_w, d_W);
    } else if (BASIS_DIM == 3) {
      WeightTensor3d<BASIS_Q_1D>(work, q_weight_1d, r_w);
      WriteElementStrided3d<1, BASIS_Q_1D>(work, e, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_w, d_W);
    }
  }
}

//------------------------------------------------------------------------------

#endif  // _ceed_hip_shared_basis_tensor_h