// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA tensor product basis with AtPoints evaluation
#include <ceed/types.h>

#include "cuda-shared-basis-read-write-templates.h"
#include "cuda-shared-basis-tensor-at-points-templates.h"
#include "cuda-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Interpolate from element dofs (d_U) to arbitrary points (d_V).
//
// Launch layout: threadIdx.x/y index within one element, threadIdx.z selects the
// element slice within the block, and blockIdx.x strides over elements.
// Dynamic shared memory must hold blockDim.z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
// points_per_elem is unused here; it is kept so all AtPoints kernels share one signature.
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                          const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Round the point count up to a multiple of the x-y thread tile so every thread runs the
  // same number of loop iterations (padded iterations revisit point i % BASIS_NUM_PTS,
  // presumably to keep the full tile active for the templates' shared-memory work — confirm).
  // Integer ceil-division replaces the double-precision ceil() + implicit narrowing of the
  // original, and is hoisted out of the element loop since blockDim is launch-uniform.
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      // Padded iterations rewrite point p with the same values, so the duplicate store is benign.
      WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose interpolation: scatter point values (d_U) back to element dofs (d_V).
// This variant zeroes the element's entries of d_V before accumulating;
// InterpTransposeAddAtPoints below sums into the existing values instead.
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                   const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Uniform padded loop bound (integer ceil-div; see InterpAtPoints)
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear coefficient accumulation register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector: r_V is zeroed and written out so this element's dofs in d_V
    // start at zero before SumElementStrided accumulates into them below
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // NOTE(review): d_U is read with index i (not p) and bound points_per_elem[elem];
      // iterations past the element's point count presumably yield zeros so they do not
      // pollute the accumulation — confirm against ReadPoint's bounds handling.
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Transpose interpolation, accumulating variant: like InterpTransposeAtPoints but
// sums into the existing values of d_V instead of zeroing the element's dofs first.
// NOTE(review): r_V is not cleared here before the "map from coefficients" step; the
// InterpTranspose* templates are expected to fully overwrite it — confirm.
extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                      const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                      const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Uniform padded loop bound (integer ceil-div; see InterpAtPoints)
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear coefficient accumulation register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // See the bounds note in InterpTransposeAtPoints: index i with bound points_per_elem[elem]
      ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
      if (BASIS_DIM == 1) {
        InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Evaluate gradients from element dofs (d_U) at arbitrary points (d_V holds
// BASIS_NUM_COMP * BASIS_DIM values per point).
//
// Launch layout matches InterpAtPoints: threadIdx.x/y within an element,
// threadIdx.z selects the element slice, blockIdx.x strides over elements.
// Dynamic shared memory must hold blockDim.z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) scalars.
// points_per_elem is unused here; kept so all AtPoints kernels share one signature.
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
                                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Round the point count up to a multiple of the x-y thread tile so every thread runs the
  // same number of loop iterations (padded iterations revisit point i % BASIS_NUM_PTS).
  // Integer ceil-division replaces the double-precision ceil() + implicit narrowing of the
  // original, and is hoisted out of the element loop since blockDim is launch-uniform.
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Map to coefficients
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_C);
    }

    // Map to points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      if (BASIS_DIM == 1) {
        GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 2) {
        GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      } else if (BASIS_DIM == 3) {
        GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
      }
      // Padded iterations rewrite point p with the same values, so the duplicate store is benign.
      WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
    }
  }
}

// Transpose gradient: scatter per-point gradients (d_U, BASIS_NUM_COMP * BASIS_DIM
// values per point) back to element dofs (d_V). This variant zeroes the element's
// entries of d_V before accumulating; GradTransposeAddAtPoints sums into them.
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                 const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Uniform padded loop bound (integer ceil-div; see GradAtPoints)
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear coefficient accumulation register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Clear output vector: r_V is zeroed and written out so this element's dofs in d_V
    // start at zero before SumElementStrided accumulates into them below
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_V[i] = 0.0;
    if (BASIS_DIM == 1) {
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }

    // Map from points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // NOTE(review): d_U is read with index i (not p) and bound points_per_elem[elem];
      // iterations past the element's point count presumably yield zeros so they do not
      // pollute the accumulation — confirm against ReadPoint's bounds handling.
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Transpose gradient, accumulating variant: like GradTransposeAtPoints but sums
// into the existing values of d_V instead of zeroing the element's dofs first.
// NOTE(review): r_V is not cleared here before the "map from coefficients" step; the
// InterpTranspose* templates are expected to fully overwrite it — confirm.
extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_X[BASIS_DIM];
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Uniform padded loop bound (integer ceil-div; see GradAtPoints)
  const CeedInt num_threads      = blockDim.x * blockDim.y;
  const CeedInt point_loop_bound = ((BASIS_NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    // Clear coefficient accumulation register
    for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

    // Map from points
    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += num_threads) {
      const CeedInt p = i % BASIS_NUM_PTS;

      ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
      // See the bounds note in GradTransposeAtPoints: index i with bound points_per_elem[elem]
      ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
                                                           r_U);
      if (BASIS_DIM == 1) {
        GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 2) {
        GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      } else if (BASIS_DIM == 3) {
        GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
      }
    }

    // Map from coefficients
    if (BASIS_DIM == 1) {
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_C, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}