// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP tensor product basis with AtPoints evaluation
#include <ceed/types.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-tensor-at-points-templates.h"
#include "hip-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------
// Launch layout shared by all kernels in this file: threadIdx.x/y index within
// one element, threadIdx.z selects one of blockDim.z element "slices" per
// block, and elements are grid-strided across gridDim.x * blockDim.z.  The
// dynamic shared allocation `slice` must hold at least
// blockDim.z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) CeedScalars,
// one slab per z-slice.  BASIS_* and BASIS_INTERP_BLOCK_SIZE are injected as
// compile-time macros by the backend when this source is JIT-compiled.

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Forward interpolation at arbitrary points:
//   d_U (nodal values, E-vector strided layout)
//     -> r_C (1D Chebyshev coefficients via c_B)
//     -> d_V (values at the per-element point coordinates read from d_X).
//
// NOTE(review): points_per_elem is unused in this non-transpose kernel; point
// indexing uses only the compile-time upper bound BASIS_NUM_PTS.  Padded loop
// iterations simply rewrite valid points (harmless for a pure write).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                        const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping handed to the read/write and tensor templates
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Each z-slice gets its own slab of the dynamic shared allocation
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                       // one point's reference coordinates
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values (a 1D pencil per component in 3D)
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP];                                  // interpolated values at one point

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients (BASIS_DIM is a compile-time macro, so only one branch survives compilation)
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Round the loop bound up to a multiple of the x-y thread count so every
    // thread makes the same number of trips (presumably required because the
    // AtPoints templates cooperate through data.slice — confirm in the
    // templates header); padded iterations wrap onto valid points via p.
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose interpolation: point values (d_U) are scattered back through the
// Chebyshev basis into nodal values, OVERWRITING d_V for this element (the
// element's d_V entries are zeroed first, then accumulated with Sum*).
//
// Unlike the forward kernel, the d_U read is indexed by the raw loop counter i
// and bounded by points_per_elem[elem], so padded iterations past the
// element's true point count do not double-count wrapped points —
// NOTE(review): this relies on ReadPoint's bound argument; confirm it yields
// zeros (or skips) for out-of-bound indices in the read/write templates.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                       // one point's reference coordinates
  CeedScalar r_U[BASIS_NUM_COMP];                                  // point values being scattered back
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // accumulated Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output pencil

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register — coefficients accumulate across the point loop below
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector — zero r_V is written out so this element's d_V
    // entries start from zero before the Sum* accumulation (overwrite semantics;
    // the *Add* variant below skips this step)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    // Uniform trip count across the block (see forward kernel note)
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Indexed by i with bound points_per_elem[elem] to avoid double-counting
      // padded iterations (see kernel comment)
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    // All threads must finish the point loop before shared memory is reused by
    // the tensor contractions below (presumably via data.slice — confirm)
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Transpose interpolation, ACCUMULATING into d_V: identical to
// InterpTransposeAtPoints except the element's d_V entries are not zeroed
// first, so results add to whatever d_V already holds.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register — coefficients accumulate across the point loop below
    // (no d_V zeroing here: Add semantics)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Bounded by points_per_elem[elem] so padded iterations contribute nothing
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    // Block-wide barrier before shared memory is reused by the contractions
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Forward gradient at arbitrary points:
//   d_U (nodal values) -> r_C (Chebyshev coefficients via c_B)
//     -> d_V (BASIS_DIM gradient components per basis component at each point
//        whose coordinates are read from d_X).
//
// Same launch layout as the Interp kernels above: threadIdx.x/y within an
// element, threadIdx.z selects an element slice, elements grid-strided over
// gridDim.x * blockDim.z; dynamic shared `slice` sized per z-slice.
//
// NOTE(review): points_per_elem is unused in this non-transpose kernel; point
// indexing uses only the compile-time upper bound BASIS_NUM_PTS.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping handed to the read/write and tensor templates
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                       // one point's reference coordinates
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];  // nodal values
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];                      // gradient (all BASIS_DIM directions) at one point

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients (BASIS_DIM is a compile-time macro, so only one branch survives compilation)
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Round the loop bound up to a multiple of the x-y thread count so every
    // thread makes the same number of trips (presumably required because the
    // AtPoints templates cooperate through data.slice — confirm in the
    // templates header); padded iterations wrap onto valid points via p.
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      // BASIS_NUM_COMP * BASIS_DIM values per point (one gradient per component)
      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose gradient: per-point gradient values (d_U, BASIS_DIM components per
// basis component) are scattered back into nodal values, OVERWRITING d_V for
// this element (d_V entries are zeroed first, then accumulated with Sum*).
//
// d_U reads are indexed by the raw loop counter i and bounded by
// points_per_elem[elem] so padded iterations past the element's true point
// count do not double-count wrapped points — NOTE(review): relies on
// ReadPoint's bound argument; confirm out-of-bound behavior in the templates.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];                                       // one point's reference coordinates
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];                      // gradient values at one point
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // accumulated Chebyshev coefficients
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];  // nodal output pencil

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register — coefficients accumulate across the point loop below
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector — zero r_V is written out so this element's d_V
    // entries start from zero before the Sum* accumulation (overwrite
    // semantics; the *Add* variant below skips this step)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    // Uniform trip count across the block (see forward kernel note)
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Indexed by i with bound points_per_elem[elem] to avoid double-counting
      // padded iterations (see kernel comment)
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    // All threads must finish the point loop before shared memory is reused by
    // the tensor contractions below (presumably via data.slice — confirm)
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Transpose gradient, ACCUMULATING into d_V: identical to
// GradTransposeAtPoints except the element's d_V entries are not zeroed first,
// so results add to whatever d_V already holds.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *c_B, const CeedInt *points_per_elem, const CeedScalar *__restrict__ d_X,
                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load chebyshev_interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register — coefficients accumulate across the point loop below
    // (no d_V zeroing here: Add semantics)
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Bounded by points_per_elem[elem] so padded iterations contribute nothing
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    // Block-wide barrier before shared memory is reused by the contractions
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}