// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis
///
/// This header is expected to be compiled with the following macros defined by
/// the caller (typically at JIT time): BASIS_DIM, BASIS_NUM_COMP, BASIS_P_1D,
/// BASIS_Q_1D, T_1D, BASIS_INTERP_BLOCK_SIZE, BASIS_GRAD_BLOCK_SIZE,
/// BASIS_WEIGHT_BLOCK_SIZE, and BASIS_HAS_COLLOCATED_GRAD.
//
// NOTE(review): the previous guard `_ceed_hip_shared_basis_tensor_h` started
// with an underscore followed by a lowercase letter, which is a reserved
// identifier in C/C++; renamed to a non-reserved form.
#ifndef CEED_HIP_SHARED_BASIS_TENSOR_H
#define CEED_HIP_SHARED_BASIS_TENSOR_H

#include <ceed.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------
/// Tensor-product interpolation of nodal values to quadrature points.
///
/// Launch layout: blockDim.z elements are processed per block (one z-slab per
/// element); elements are covered with a grid-stride loop, so any grid size is
/// correct.  Dynamic shared memory must provide
/// blockDim.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1) CeedScalars for the slices.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load interp_1d into shared memory; barrier before any thread reads s_B
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Each z-slab owns a private T_1D x T_1D (T_1D in 1D) scratch slice
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Per-thread registers; in 3D each thread holds a full 1D pencil of values
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // Grid-stride loop over elements (blockDim.z elements per block)
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

/// Transpose of Interp: map quadrature-point values back to nodal values.
/// Same launch layout and shared-memory requirement as Interp.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load interp_1d into shared memory; barrier before any thread reads s_B
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register sizes are swapped relative to Interp: read Q points, write P dofs
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // Grid-stride loop over elements
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------
/// Tensor-product gradient: nodal values -> BASIS_DIM derivative components at
/// quadrature points.  With BASIS_HAS_COLLOCATED_GRAD the 3D path uses the
/// Q x Q collocated derivative matrix instead of the Q x P one.
/// Same launch layout and shared-memory requirement as Interp.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
    void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load interp_1d and grad_1d into shared memory; barrier before reads
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Output carries BASIS_DIM derivative components per quadrature point
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // Grid-stride loop over elements
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                    d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

/// Transpose of Grad: BASIS_DIM derivative components at quadrature points ->
/// nodal values.  Same launch layout and shared-memory requirement as Interp.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
    void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
                       CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Load interp_1d and grad_1d into shared memory; barrier before reads
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Input carries BASIS_DIM derivative components per quadrature point
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // Grid-stride loop over elements
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                   r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------
/// Write the tensor-product quadrature weights for each element (single
/// component).  Same launch layout and shared-memory requirement as Interp.
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Single-component output; one 1D pencil of weights per thread in 3D
  CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];

  // Grid-stride loop over elements
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 2) {
      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 3) {
      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                           d_W);
    }
  }
}

//------------------------------------------------------------------------------

#endif  // CEED_HIP_SHARED_BASIS_TENSOR_H