// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA tensor product basis with AtPoints evaluation
#include <ceed/types.h>

#include "cuda-shared-basis-read-write-templates.h"
#include "cuda-shared-basis-tensor-at-points-templates.h"
#include "cuda-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Interpolate element coefficients d_U to values d_V at the points d_X.
//
// Launch layout: threadIdx.x/.y index within one element, threadIdx.z selects an
// element slot within the block, and blocks grid-stride over elements via
// blockIdx.x.  Dynamic shared memory provides one T_1D x (T_1D if BASIS_DIM > 1,
// else 1) CeedScalar slice per threadIdx.z slot.
//
// c_B             - interpolation matrix, BASIS_P_1D * BASIS_Q_1D entries
// points_per_elem - per-element point counts; unused in this kernel (kept so the
//                   signature matches InterpTransposeAtPoints, which reads it)
// d_X             - point coordinates (BASIS_DIM values per point)
// d_U             - element coefficients, d_V - interpolated values at points
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                          const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Integer ceiling division (replaces double-precision ceil()): pad the loop bound up to
    // a multiple of the x-y thread count so every thread makes the same number of trips
    // (presumably required because the per-point templates use data.slice collectively —
    // confirm against the template headers); excess iterations wrap via p = i % BASIS_NUM_PTS.
    const CeedInt num_xy_threads   = blockDim.x * blockDim.y;
    const CeedInt point_loop_bound = num_xy_threads * ((BASIS_NUM_PTS + num_xy_threads - 1) / num_xy_threads);

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose action: accumulate point values d_U back into element coefficients d_V.
//
// Same launch layout and shared-memory usage as InterpAtPoints.  Here
// points_per_elem[elem] is read and passed to ReadPoint as the per-element point
// bound for d_U; contributions are accumulated into r_C across the padded point
// loop, then applied transposed through s_B and summed into d_V.
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register accumulator before gathering this element's point contributions
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Integer ceiling division (replaces double-precision ceil()): pad the loop bound up to
    // a multiple of the x-y thread count so every thread makes the same number of trips.
    const CeedInt num_xy_threads   = blockDim.x * blockDim.y;
    const CeedInt point_loop_bound = num_xy_threads * ((BASIS_NUM_PTS + num_xy_threads - 1) / num_xy_threads);

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Note: d_U is read at index i (not p) with points_per_elem[elem] as the bound,
      // unlike d_X above — presumably so padded iterations contribute zeros; confirm
      // against ReadPoint's out-of-bound behavior in the read-write templates.
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Evaluate gradients of element coefficients d_U at the points d_X, writing
// BASIS_NUM_COMP * BASIS_DIM values per point to d_V.
//
// Launch layout: threadIdx.x/.y index within one element, threadIdx.z selects an
// element slot within the block, and blocks grid-stride over elements via
// blockIdx.x.  Dynamic shared memory provides one T_1D x (T_1D if BASIS_DIM > 1,
// else 1) CeedScalar slice per threadIdx.z slot.
//
// c_B             - interpolation matrix, BASIS_P_1D * BASIS_Q_1D entries
// points_per_elem - per-element point counts; unused in this kernel (kept so the
//                   signature matches GradTransposeAtPoints, which reads it)
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    // Integer ceiling division (replaces double-precision ceil()): pad the loop bound up to
    // a multiple of the x-y thread count so every thread makes the same number of trips;
    // excess iterations wrap via p = i % BASIS_NUM_PTS.
    const CeedInt num_xy_threads   = blockDim.x * blockDim.y;
    const CeedInt point_loop_bound = num_xy_threads * ((BASIS_NUM_PTS + num_xy_threads - 1) / num_xy_threads);

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose action: accumulate per-point gradient data d_U (BASIS_NUM_COMP *
// BASIS_DIM values per point) back into element coefficients d_V.
//
// Same launch layout and shared-memory usage as GradAtPoints.  Here
// points_per_elem[elem] is read and passed to ReadPoint as the per-element point
// bound for d_U; contributions are accumulated into r_C across the padded point
// loop, then applied transposed through s_B and summed into d_V.
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear register accumulator before gathering this element's point contributions
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    // Integer ceiling division (replaces double-precision ceil()): pad the loop bound up to
    // a multiple of the x-y thread count so every thread makes the same number of trips.
    const CeedInt num_xy_threads   = blockDim.x * blockDim.y;
    const CeedInt point_loop_bound = num_xy_threads * ((BASIS_NUM_PTS + num_xy_threads - 1) / num_xy_threads);

    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // Note: d_U is read at index i (not p) with points_per_elem[elem] as the bound,
      // unlike d_X above — presumably so padded iterations contribute zeros; confirm
      // against ReadPoint's out-of-bound behavior in the read-write templates.
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }
    __syncthreads();

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}