// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis

#include <ceed.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------
// Apply the 1D interpolation matrix in tensor-product form, nodes -> quadrature
// points, for BASIS_DIM = 1, 2, or 3 (the dimension branch resolves at compile
// time). Dynamic shared memory holds one T_1D x T_1D slice per z-slab of the
// block; blockDim.z elements are processed concurrently per block.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d,
                                                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Stage the 1D interpolation matrix in shared memory.
  // Barrier before any thread reads it in the tensor contractions below.
  __shared__ CeedScalar s_interp[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_interp);
  __syncthreads();

  // Per-thread bookkeeping: thread ids plus this z-slab's slice of the dynamic shared scratch
  SharedData_Hip thread_data;
  thread_data.t_id_x = threadIdx.x;
  thread_data.t_id_y = threadIdx.y;
  thread_data.t_id_z = threadIdx.z;
  thread_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  thread_data.slice  = scratch + threadIdx.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register tiles: in 3D each thread keeps a full 1D pencil per component; in 1D/2D a single value
  CeedScalar r_u[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_v[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // Grid-stride loop over elements, blockDim.z elements per block
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_u);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_v, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_u);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_v, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_u);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_v, d_V);
    }
  }
}

// Transpose of Interp: quadrature points -> nodes. Identical structure, with the
// roles of BASIS_P_1D and BASIS_Q_1D swapped on the read/write sides.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d,
                                                                                      const CeedScalar *__restrict__ d_U,
                                                                                      CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Stage the 1D interpolation matrix in shared memory; barrier before use
  __shared__ CeedScalar s_interp[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_interp);
  __syncthreads();

  // Per-thread bookkeeping and z-slab scratch slice
  SharedData_Hip thread_data;
  thread_data.t_id_x = threadIdx.x;
  thread_data.t_id_y = threadIdx.y;
  thread_data.t_id_z = threadIdx.z;
  thread_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  thread_data.slice  = scratch + threadIdx.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register tiles: quadrature-point values in, nodal values out
  CeedScalar r_u[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_v[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // Grid-stride loop over elements, blockDim.z elements per block
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_u);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_v, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_u);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_v, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_u);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, r_v);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_v, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------
// Apply the 1D gradient (and interpolation) matrices in tensor-product form,
// nodes -> gradients at quadrature points. Output carries BASIS_DIM gradient
// components per field component. When BASIS_HAS_COLLOCATED_GRAD is set the
// gradient matrix is Q_1D x Q_1D (collocated variant); otherwise Q_1D x P_1D.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d,
                                                                         const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
                                                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Stage both 1D matrices in shared memory; barrier before use
  __shared__ CeedScalar s_interp[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_interp);
  __shared__ CeedScalar s_grad[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_grad);
  __syncthreads();

  // Per-thread bookkeeping and z-slab scratch slice
  SharedData_Hip thread_data;
  thread_data.t_id_x = threadIdx.x;
  thread_data.t_id_y = threadIdx.y;
  thread_data.t_id_z = threadIdx.z;
  thread_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  thread_data.slice  = scratch + threadIdx.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register tiles: nodal values in, BASIS_DIM gradient components out
  CeedScalar r_u[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_v[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // Grid-stride loop over elements, blockDim.z elements per block
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_u);
      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_v, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_u);
      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D, r_v, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_u);
      // Compile-time dispatch between collocated and standard 3D gradient contractions
      if (BASIS_HAS_COLLOCATED_GRAD) {
        GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      } else {
        GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      }
      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_v, d_V);
    }
  }
}

// Transpose of Grad: gradients at quadrature points -> nodes. Reads BASIS_DIM
// gradient components per field component and writes one nodal value each.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d,
                                                                                  const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
                                                                                  CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar scratch[];

  // Stage both 1D matrices in shared memory; barrier before use
  __shared__ CeedScalar s_interp[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_interp);
  __shared__ CeedScalar s_grad[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_grad);
  __syncthreads();

  // Per-thread bookkeeping and z-slab scratch slice
  SharedData_Hip thread_data;
  thread_data.t_id_x = threadIdx.x;
  thread_data.t_id_y = threadIdx.y;
  thread_data.t_id_z = threadIdx.z;
  thread_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  thread_data.slice  = scratch + threadIdx.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register tiles: BASIS_DIM gradient components in, nodal values out
  CeedScalar r_u[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_v[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // Grid-stride loop over elements, blockDim.z elements per block
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_u);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_v, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D, d_U, r_u);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_v, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_u);
      // Compile-time dispatch between collocated and standard 3D transpose contractions
      if (BASIS_HAS_COLLOCATED_GRAD) {
        GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      } else {
        GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(thread_data, r_u, s_interp, s_grad, r_v);
      }
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(thread_data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_v, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------
// Write the tensor-product quadrature weights for every element (single
// component). No matrices are staged, so no barrier is needed before the loop.
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d,
                                                                             CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar scratch[];

  // Per-thread bookkeeping and z-slab scratch slice
  SharedData_Hip thread_data;
  thread_data.t_id_x = threadIdx.x;
  thread_data.t_id_y = threadIdx.y;
  thread_data.t_id_z = threadIdx.z;
  thread_data.t_id   = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
  thread_data.slice  = scratch + threadIdx.z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register tile: one pencil of weights in 3D, a single value otherwise
  CeedScalar r_w[BASIS_DIM > 2 ? BASIS_Q_1D : 1];

  // Grid-stride loop over elements, blockDim.z elements per block
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      Weight1d<BASIS_Q_1D>(thread_data, q_weight_1d, r_w);
      WriteElementStrided1d<1, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_w, d_W);
    } else if (BASIS_DIM == 2) {
      WeightTensor2d<BASIS_Q_1D>(thread_data, q_weight_1d, r_w);
      WriteElementStrided2d<1, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_w, d_W);
    } else if (BASIS_DIM == 3) {
      WeightTensor3d<BASIS_Q_1D>(thread_data, q_weight_1d, r_w);
      WriteElementStrided3d<1, BASIS_Q_1D>(thread_data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                           BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_w, d_W);
    }
  }
}