// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D tensor contraction x
//
// Each thread holds one nodal value in *U.  The values are staged through
// shared memory (data.slice) so every thread can read all P_1D inputs, then
// threads with t_id_x < Q_1D compute
//   *V = sum_i B[i + t_id_x * P_1D] * U(i)
// Threads with t_id_x >= Q_1D leave *V = 0.0.  Both __syncthreads() calls sit
// outside the divergent guard, so all threads in the block reach them.
// NOTE(review): assumes data.slice can hold one CeedScalar per thread in the
// x dimension of the block — confirm against the launch configuration.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + data.t_id_x * P_1D] * data.slice[i];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//
// Transpose of ContractX1d (quadrature values back to nodes): threads with
// t_id_x < P_1D compute *V = sum_i B[t_id_x + i * P_1D] * U(i), reading B
// with the roles of its row/column indices swapped relative to ContractX1d.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[data.t_id_x + i * P_1D] * data.slice[i];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Applies the interpolation matrix c_B to each of the NUM_COMP components,
// one register value per thread per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Applies c_B transposed (quadrature values back to nodes) per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// Applies the derivative matrix c_G per component.  c_B is accepted for
// interface symmetry with the multi-dimensional Grad variants but is unused
// in 1D, where the derivative is a single contraction with c_G.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of Grad1d; c_B is likewise unused in the 1D case.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Threads with t_id_x < Q_1D load their quadrature weight; others get 0.0.
// P_1D is unused; it keeps the Weight* templates' parameter lists uniform.
template <int P_1D, int Q_1D>
inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//
// Stages one value per thread at slice[t_id_x + t_id_y * T_1D] (row stride
// T_1D), then threads with t_id_x < Q_1D && t_id_y < P_1D contract along x:
//   V(t_id_x, t_id_y) = sum_i B[i + t_id_x * P_1D] * U(i, t_id_y)
// Both barriers are outside the divergent guard, so all threads reach them.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//
// Same staging as ContractX2d but contracts along the y index; active threads
// satisfy t_id_x < Q_1D && t_id_y < Q_1D (x was already contracted to Q_1D).
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//
// Transpose of ContractY2d: contracts the Q_1D quadrature values in y back to
// P_1D nodes (active guard t_id_x < Q_1D && t_id_y < P_1D).
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//
// Transpose of ContractX2d; overwrites *V (see ContractTransposeAddX2d for
// the accumulating variant).
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//
// Identical to ContractTransposeX2d except *V is NOT zeroed first — the
// contraction result is accumulated into the caller's existing value.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Pack: move each component from the 2D (t_id_x, t_id_y) thread layout into a
// flattened layout where thread data.t_id_x < Q_1D * Q_1D owns the value at
// grid position (data.t_id_x % Q_1D, data.t_id_x / Q_1D); all other threads
// get 0.0.  The second template parameter is the 1D extent of the tile being
// packed (callers pass P_1D here for nodal data).
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
    __syncthreads();
  }
}

// Unpack: inverse of QPack2D — scatter each component from the flattened
// layout back to the 2D (t_id_x, t_id_y) layout; threads outside the
// Q_1D x Q_1D tile get 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Core: contract x then y with c_B per component, using a one-element
// register temporary between the two passes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// 2D thread-layout entry point (thread ids taken directly from data).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

// Flattened (1D thread-indexing) entry point: unpack the P_1D nodal tile,
// run the core with derived 2D indices, then pack the Q_1D quadrature tile.
// Note r_U is modified in place by the unpack.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
  QPack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Core: transpose contractions in the reverse order (y then x).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// 2D thread-layout entry point.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

// Flattened entry point: unpack Q_1D quadrature data, contract, pack P_1D
// nodal data (mirror image of InterpTensor2dFlattened).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Core: d/dx = (G in x) then (B in y); d/dy = (B in x) then (G in y).
// Output layout: r_V[comp + dim * NUM_COMP] for dim = 0 (x), 1 (y).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
}

// 2D thread-layout entry point.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

// Flattened entry point; packs NUM_COMP * 2 outputs (x and y derivatives).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Core: the x-derivative term overwrites r_V, then the y-derivative term is
// accumulated via the Add variant so both contributions sum into one output.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// 2D thread-layout entry point.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

// Flattened entry point; unpacks NUM_COMP * 2 derivative inputs.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Core: tensor-product weight wx * wy for threads inside the Q_1D x Q_1D
// tile, 0.0 elsewhere.
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

// 2D thread-layout entry point (P_1D unused; kept for uniform interface).
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
}

// Flattened entry point: derive 2D indices with stride Q_1D.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//
// U and V hold one z-column of values in registers (U[k] is layer k).  The
// thread's row of B is cached in registers once, then each of the P_1D
// z-layers is staged through shared memory and contracted along x.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_x * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//
// As ContractX3d but contracts along y (the thread's B row is indexed by
// t_id_y and the shared-memory reads walk the y stride T_1D).
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_y * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//
// The z direction lives entirely in each thread's registers, so no shared
// memory or barriers are needed: V[k] = sum_i B[i + k * P_1D] * U[i].
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//
// Register-only like ContractZ3d, with B indexed transposed.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//
// Identical to ContractTransposeY3d except V[k] is NOT zeroed — results are
// accumulated into the caller's existing values.
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//
// Accumulating variant of ContractTransposeX3d (V[k] is not zeroed).
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Contract x, y, then z with c_B; r_U holds P_1D values per component and
// r_V receives Q_1D values per component (z-column in registers).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Quadrature values back to nodes: transpose contractions in z, y, x order.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Each derivative direction applies c_G in that direction and c_B in the two
// others.  Output layout: r_V[comp * Q_1D + dim * NUM_COMP * Q_1D] for
// dim = 0 (x), 1 (y), 2 (z).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor3d: the x-derivative term overwrites r_V and the y
// and z terms are accumulated via the Add variants so all three sum into one
// nodal output per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated gradient)
//------------------------------------------------------------------------------
// First interpolates to quadrature space (B in x, y, z), then applies the
// collocated derivative matrix c_G independently in each direction (note the
// Q_1D-as-P_1D template arguments on the second set of contractions).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose (collocated gradient)
//------------------------------------------------------------------------------
// Transpose of GradTensorCollocated3d: sum the three transposed collocated
// derivative terms in quadrature space (z overwrites, y and x accumulate),
// then interpolate back to nodes with transposed B contractions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Computes the 3D tensor-product quadrature weights for this thread's
// (t_id_x, t_id_y) column: w[q] = wx * wy * wz(q) for each of the Q_1D
// z-levels.  Threads outside the Q_1D x Q_1D tile receive all zeros.
// P_1D is unused; it keeps the Weight* templates' parameter lists uniform.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    // Hoist the x-y plane weight; only the z factor varies along the column
    const CeedScalar w_xy = q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y];
    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = w_xy * q_weight_1d[q];
    }
  } else {
    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = 0.0;
    }
  }
}