1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP tensor product basis with AtPoints evaluation 10 #include <ceed/types.h> 11 12 #include "hip-shared-basis-read-write-templates.h" 13 #include "hip-shared-basis-tensor-at-points-templates.h" 14 #include "hip-shared-basis-tensor-templates.h" 15 16 //------------------------------------------------------------------------------ 17 // Tensor Basis Kernels AtPoints 18 //------------------------------------------------------------------------------ 19 20 //------------------------------------------------------------------------------ 21 // Interp 22 //------------------------------------------------------------------------------ 23 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 24 void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 25 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 26 extern __shared__ CeedScalar slice[]; 27 28 SharedData_Hip data; 29 data.t_id_x = threadIdx.x; 30 data.t_id_y = threadIdx.y; 31 data.t_id_z = threadIdx.z; 32 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 33 data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 34 35 CeedScalar r_X[BASIS_DIM]; 36 CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 37 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 38 CeedScalar r_V[BASIS_NUM_COMP]; 39 40 // load chebyshev_interp_1d into shared memory 41 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 42 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 43 __syncthreads(); 44 45 // Apply basis element by element 46 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 47 // Map to coefficients 48 if (BASIS_DIM == 1) { 49 ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 50 Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 51 } else if (BASIS_DIM == 2) { 52 ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 53 InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 54 } else if (BASIS_DIM == 3) { 55 ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 56 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 57 InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 58 } 59 60 // Map to points 61 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 62 63 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 64 const CeedInt p = i % BASIS_NUM_PTS; 65 66 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 67 if (BASIS_DIM == 1) { 68 InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 69 } else if (BASIS_DIM == 2) { 70 InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 71 } else if (BASIS_DIM == 3) { 72 InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 73 } 74 WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 75 } 76 } 77 } 78 79 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 80 void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 81 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 82 extern __shared__ CeedScalar slice[]; 83 84 SharedData_Hip data; 85 data.t_id_x = threadIdx.x; 86 data.t_id_y = threadIdx.y; 87 data.t_id_z = threadIdx.z; 88 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 89 data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 90 91 CeedScalar r_X[BASIS_DIM]; 92 CeedScalar r_U[BASIS_NUM_COMP]; 93 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 94 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 95 96 // load chebyshev_interp_1d into shared memory 97 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 98 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 99 __syncthreads(); 100 101 // Apply basis element by element 102 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 103 // Clear register 104 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 105 106 // Map from points 107 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 108 109 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 110 const CeedInt p = i % BASIS_NUM_PTS; 111 112 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 113 ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); 114 if (BASIS_DIM == 1) { 115 InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 116 } else if (BASIS_DIM == 2) { 117 InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 118 } else if (BASIS_DIM == 3) { 119 InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 120 } 121 } 122 __syncthreads(); 123 124 // Map from coefficients 125 if (BASIS_DIM == 1) { 126 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 127 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 128 } else if (BASIS_DIM == 2) { 129 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 130 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 131 } else if (BASIS_DIM == 3) { 132 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 133 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 134 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 135 } 136 } 137 } 138 139 //------------------------------------------------------------------------------ 140 // Grad 141 //------------------------------------------------------------------------------ 142 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 143 void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 144 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 145 extern __shared__ CeedScalar slice[]; 146 147 SharedData_Hip data; 148 data.t_id_x = threadIdx.x; 149 data.t_id_y = threadIdx.y; 150 data.t_id_z = threadIdx.z; 151 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 152 data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 153 154 CeedScalar r_X[BASIS_DIM]; 155 CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; 156 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 157 CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; 158 159 // load chebyshev_interp_1d into shared memory 160 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 161 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 162 __syncthreads(); 163 164 // Apply basis element by element 165 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 166 // Map to coefficients 167 if (BASIS_DIM == 1) { 168 ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); 169 Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 170 } else if (BASIS_DIM == 2) { 171 ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); 172 InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 173 } else if (BASIS_DIM == 3) { 174 ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 175 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); 176 InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C); 177 } 178 179 // Map to points 180 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 181 182 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 183 const CeedInt p = i % BASIS_NUM_PTS; 184 185 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 186 if (BASIS_DIM == 1) { 187 GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 188 } else if (BASIS_DIM == 2) { 189 GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 190 } else if (BASIS_DIM == 3) { 191 GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V); 192 } 193 WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); 194 } 195 } 196 } 197 198 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ 199 void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X, 200 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { 201 extern __shared__ CeedScalar slice[]; 202 203 SharedData_Hip data; 204 data.t_id_x = threadIdx.x; 205 data.t_id_y = threadIdx.y; 206 data.t_id_z = threadIdx.z; 207 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; 208 data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); 209 210 CeedScalar r_X[BASIS_DIM]; 211 CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; 212 CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 213 CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; 214 215 // load chebyshev_interp_1d into shared memory 216 __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D]; 217 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); 218 __syncthreads(); 219 220 // Apply basis element by element 221 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 222 // Clear register 223 for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; 224 225 // Map from points 226 const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); 227 228 for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { 229 const CeedInt p = i % BASIS_NUM_PTS; 230 231 ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); 232 ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, 233 r_U); 234 if (BASIS_DIM == 1) { 235 GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 236 } else if (BASIS_DIM == 2) { 237 GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 238 } else if (BASIS_DIM == 3) { 239 GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C); 240 } 241 } 242 __syncthreads(); 243 244 // Map from coefficients 245 if (BASIS_DIM == 1) { 246 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 247 SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); 248 } else if (BASIS_DIM == 2) { 249 InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 250 SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); 251 } else if (BASIS_DIM == 3) { 252 InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V); 253 SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, 254 BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); 255 } 256 } 257 } 258