1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 __syncthreads(); 22 data.slice[data.t_id_x] = *U; 23 __syncthreads(); 24 *V = 0.0; 25 if (data.t_id_x < Q_1D) { 26 for (CeedInt i = 0; i < P_1D; i++) { 27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 28 } 29 } 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 __syncthreads(); 38 data.slice[data.t_id_x] = *U; 39 __syncthreads(); 40 *V = 0.0; 41 if (data.t_id_x < P_1D) { 42 for (CeedInt i = 0; i < Q_1D; i++) { 43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 44 } 45 } 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D interpolate to quadrature points, nodes and quadrature points collocated 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 r_V[comp] = r_U[comp]; 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D interpolate transpose, nodes and quadrature points collocated 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 r_V[comp] = r_U[comp]; 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D derivatives at quadrature points 93 //------------------------------------------------------------------------------ 94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 95 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 96 CeedScalar *__restrict__ r_V) { 97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 98 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 99 } 100 } 101 102 //------------------------------------------------------------------------------ 103 // 1D derivatives transpose 104 //------------------------------------------------------------------------------ 105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 106 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 107 CeedScalar *__restrict__ r_V) { 108 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 109 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 110 } 111 } 112 113 //------------------------------------------------------------------------------ 114 // 1D quadrature weights 115 //------------------------------------------------------------------------------ 116 template <int P_1D, int Q_1D> 117 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 118 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 119 } 120 121 //------------------------------------------------------------------------------ 122 // 2D 123 //------------------------------------------------------------------------------ 124 125 //------------------------------------------------------------------------------ 126 // 2D tensor contraction x 127 //------------------------------------------------------------------------------ 128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 129 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130 __syncthreads(); 131 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 132 __syncthreads(); 133 *V = 0.0; 134 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 135 for (CeedInt i = 0; i < P_1D; i++) { 136 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 137 } 138 } 139 } 140 141 //------------------------------------------------------------------------------ 142 // 2D tensor contract y 143 //------------------------------------------------------------------------------ 144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 145 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 146 __syncthreads(); 147 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 148 __syncthreads(); 149 *V = 0.0; 150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 151 for (CeedInt i = 0; i < P_1D; i++) { 152 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 153 } 154 } 155 } 156 157 //------------------------------------------------------------------------------ 158 // 2D transpose tensor contract y 159 //------------------------------------------------------------------------------ 160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 161 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 162 __syncthreads(); 163 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 164 __syncthreads(); 165 *V = 0.0; 166 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 167 for (CeedInt i = 0; i < Q_1D; i++) { 168 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 169 } 170 } 171 } 172 173 //------------------------------------------------------------------------------ 174 // 2D transpose tensor contract x 175 //------------------------------------------------------------------------------ 176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 177 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 178 __syncthreads(); 179 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 180 __syncthreads(); 181 *V = 0.0; 182 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 183 for (CeedInt i = 0; i < Q_1D; i++) { 184 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 185 } 186 } 187 } 188 189 //------------------------------------------------------------------------------ 190 // 2D transpose tensor contract and add x 191 //------------------------------------------------------------------------------ 192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 193 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 194 __syncthreads(); 195 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 196 __syncthreads(); 197 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 198 for (CeedInt i = 0; i < Q_1D; i++) { 199 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 200 } 201 } 202 } 203 204 //------------------------------------------------------------------------------ 205 // 2D interpolate to quadrature points 206 //------------------------------------------------------------------------------ 207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 208 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 209 CeedScalar *__restrict__ r_V) { 210 CeedScalar r_t[1]; 211 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 212 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 213 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 214 } 215 } 216 217 //------------------------------------------------------------------------------ 218 // 2D interpolate transpose 219 //------------------------------------------------------------------------------ 220 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 221 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 222 CeedScalar *__restrict__ r_V) { 223 CeedScalar r_t[1]; 224 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 225 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 226 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 227 } 228 } 229 230 //------------------------------------------------------------------------------ 231 // 2D interpolate to quadrature points, nodes and quadrature points collocated 232 //------------------------------------------------------------------------------ 233 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 234 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 235 CeedScalar *__restrict__ r_V) { 236 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 237 r_V[comp] = r_U[comp]; 238 } 239 } 240 241 //------------------------------------------------------------------------------ 242 // 2D interpolate transpose, nodes and quadrature points collocated 243 //------------------------------------------------------------------------------ 244 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 245 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 246 CeedScalar *__restrict__ r_V) { 247 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 248 r_V[comp] = r_U[comp]; 249 } 250 } 251 252 //------------------------------------------------------------------------------ 253 // 2D derivatives at quadrature points 254 //------------------------------------------------------------------------------ 255 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 256 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 257 CeedScalar *__restrict__ r_V) { 258 CeedScalar r_t[1]; 259 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 260 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 261 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 262 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 263 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 264 } 265 } 266 267 //------------------------------------------------------------------------------ 268 // 2D derivatives transpose 269 //------------------------------------------------------------------------------ 270 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 271 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 272 CeedScalar *__restrict__ r_V) { 273 CeedScalar r_t[1]; 274 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 275 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 276 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 277 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 278 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 279 } 280 } 281 282 //------------------------------------------------------------------------------ 283 // 2D derivatives at quadrature points, nodes and quadrature points collocated 284 //------------------------------------------------------------------------------ 285 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 286 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 287 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 288 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 289 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 290 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 291 } 292 } 293 294 //------------------------------------------------------------------------------ 295 // 2D derivatives transpose, nodes and quadrature points collocated 296 //------------------------------------------------------------------------------ 297 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 298 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 299 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 300 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 301 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 302 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 303 } 304 } 305 306 //------------------------------------------------------------------------------ 307 // 2D quadrature weights 308 //------------------------------------------------------------------------------ 309 template <int P_1D, int Q_1D> 310 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 311 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 312 } 313 314 //------------------------------------------------------------------------------ 315 // 3D 316 //------------------------------------------------------------------------------ 317 318 //------------------------------------------------------------------------------ 319 // 3D tensor contract x 320 //------------------------------------------------------------------------------ 321 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 322 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 323 CeedScalar r_B[P_1D]; 324 for (CeedInt i = 0; i < P_1D; i++) { 325 r_B[i] = B[i + data.t_id_x * P_1D]; 326 } 327 328 for (CeedInt k = 0; k < P_1D; k++) { 329 __syncthreads(); 330 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 331 __syncthreads(); 332 V[k] = 0.0; 333 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 334 for (CeedInt i = 0; i < P_1D; i++) { 335 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 336 } 337 } 338 } 339 } 340 341 //------------------------------------------------------------------------------ 342 // 3D tensor contract y 343 //------------------------------------------------------------------------------ 344 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 345 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 346 CeedScalar r_B[P_1D]; 347 for (CeedInt i = 0; i < P_1D; i++) { 348 r_B[i] = B[i + data.t_id_y * P_1D]; 349 } 350 351 for (CeedInt k = 0; k < P_1D; k++) { 352 __syncthreads(); 353 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 354 __syncthreads(); 355 V[k] = 0.0; 356 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 357 for (CeedInt i = 0; i < P_1D; i++) { 358 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 359 } 360 } 361 } 362 } 363 364 //------------------------------------------------------------------------------ 365 // 3D tensor contract z 366 //------------------------------------------------------------------------------ 367 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 368 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 369 for (CeedInt k = 0; k < Q_1D; k++) { 370 V[k] = 0.0; 371 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 372 for (CeedInt i = 0; i < P_1D; i++) { 373 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 374 } 375 } 376 } 377 } 378 379 //------------------------------------------------------------------------------ 380 // 3D transpose tensor contract z 381 //------------------------------------------------------------------------------ 382 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 383 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 384 for (CeedInt k = 0; k < P_1D; k++) { 385 V[k] = 0.0; 386 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 387 for (CeedInt i = 0; i < Q_1D; i++) { 388 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 389 } 390 } 391 } 392 } 393 394 //------------------------------------------------------------------------------ 395 // 3D transpose tensor contract y 396 //------------------------------------------------------------------------------ 397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 398 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 399 CeedScalar r_B[Q_1D]; 400 for (CeedInt i = 0; i < Q_1D; i++) { 401 r_B[i] = B[data.t_id_y + i * P_1D]; 402 } 403 404 for (CeedInt k = 0; k < P_1D; k++) { 405 __syncthreads(); 406 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 407 __syncthreads(); 408 V[k] = 0.0; 409 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 410 for (CeedInt i = 0; i < Q_1D; i++) { 411 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 412 } 413 } 414 } 415 } 416 417 //------------------------------------------------------------------------------ 418 // 3D transpose tensor contract y 419 //------------------------------------------------------------------------------ 420 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 421 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 422 CeedScalar r_B[Q_1D]; 423 for (CeedInt i = 0; i < Q_1D; i++) { 424 r_B[i] = B[data.t_id_y + i * P_1D]; 425 } 426 427 for (CeedInt k = 0; k < P_1D; k++) { 428 __syncthreads(); 429 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 430 __syncthreads(); 431 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 432 for (CeedInt i = 0; i < Q_1D; i++) { 433 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 434 } 435 } 436 } 437 } 438 439 //------------------------------------------------------------------------------ 440 // 3D transpose tensor contract x 441 //------------------------------------------------------------------------------ 442 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 443 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 444 CeedScalar r_B[Q_1D]; 445 for (CeedInt i = 0; i < Q_1D; i++) { 446 r_B[i] = B[data.t_id_x + i * P_1D]; 447 } 448 449 for (CeedInt k = 0; k < P_1D; k++) { 450 __syncthreads(); 451 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 452 __syncthreads(); 453 V[k] = 0.0; 454 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 455 for (CeedInt i = 0; i < Q_1D; i++) { 456 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 457 } 458 } 459 } 460 } 461 462 //------------------------------------------------------------------------------ 463 // 3D transpose tensor contract add x 464 //------------------------------------------------------------------------------ 465 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 466 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 467 CeedScalar r_B[Q_1D]; 468 for (CeedInt i = 0; i < Q_1D; i++) { 469 r_B[i] = B[data.t_id_x + i * P_1D]; 470 } 471 472 for (CeedInt k = 0; k < P_1D; k++) { 473 __syncthreads(); 474 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 475 __syncthreads(); 476 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 477 for (CeedInt i = 0; i < Q_1D; i++) { 478 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 479 } 480 } 481 } 482 } 483 484 //------------------------------------------------------------------------------ 485 // 3D interpolate to quadrature points 486 //------------------------------------------------------------------------------ 487 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 488 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 489 CeedScalar *__restrict__ r_V) { 490 CeedScalar r_t1[T_1D]; 491 CeedScalar r_t2[T_1D]; 492 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 493 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 494 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 495 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 496 } 497 } 498 499 //------------------------------------------------------------------------------ 500 // 3D interpolate transpose 501 //------------------------------------------------------------------------------ 502 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 503 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 504 CeedScalar *__restrict__ r_V) { 505 CeedScalar r_t1[T_1D]; 506 CeedScalar r_t2[T_1D]; 507 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 508 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 509 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 510 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 511 } 512 } 513 514 //------------------------------------------------------------------------------ 515 // 3D interpolate to quadrature points, nodes and quadrature points collocated 516 //------------------------------------------------------------------------------ 517 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 518 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 519 CeedScalar *__restrict__ r_V) { 520 for (CeedInt i = 0; i < Q_1D; i++) { 521 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 522 r_V[i + comp * Q_1D] = r_U[i + comp * P_1D]; 523 } 524 } 525 } 526 527 //------------------------------------------------------------------------------ 528 // 3D interpolate transpose, nodes and quadrature points collocated 529 //------------------------------------------------------------------------------ 530 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 531 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 532 CeedScalar *__restrict__ r_V) { 533 for (CeedInt i = 0; i < Q_1D; i++) { 534 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 535 r_V[i + comp * P_1D] = r_U[i + comp * Q_1D]; 536 } 537 } 538 } 539 540 //------------------------------------------------------------------------------ 541 // 3D derivatives at quadrature points 542 //------------------------------------------------------------------------------ 543 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 544 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 545 CeedScalar *__restrict__ r_V) { 546 CeedScalar r_t1[T_1D]; 547 CeedScalar r_t2[T_1D]; 548 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 549 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 550 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 551 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 552 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 553 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 554 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 555 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 556 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 557 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 558 } 559 } 560 561 //------------------------------------------------------------------------------ 562 // 3D derivatives transpose 563 //------------------------------------------------------------------------------ 564 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 565 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 566 CeedScalar *__restrict__ r_V) { 567 CeedScalar r_t1[T_1D]; 568 CeedScalar r_t2[T_1D]; 569 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 570 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 571 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 572 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 573 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 574 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 575 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 576 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 577 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 578 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 579 } 580 } 581 582 //------------------------------------------------------------------------------ 583 // 3D derivatives at quadrature points 584 //------------------------------------------------------------------------------ 585 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 586 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 587 CeedScalar *__restrict__ r_V) { 588 CeedScalar r_t1[T_1D]; 589 CeedScalar r_t2[T_1D]; 590 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 591 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 592 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 593 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 594 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 595 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 596 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 597 } 598 } 599 600 //------------------------------------------------------------------------------ 601 // 3D derivatives transpose 602 //------------------------------------------------------------------------------ 603 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 604 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 605 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 606 CeedScalar r_t1[T_1D]; 607 CeedScalar r_t2[T_1D]; 608 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 609 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 610 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 611 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 612 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 613 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 614 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 615 } 616 } 617 618 //------------------------------------------------------------------------------ 619 // 3D derivatives at quadrature points, nodes and quadrature points collocated 620 //------------------------------------------------------------------------------ 621 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 622 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 623 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 624 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 625 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 626 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 627 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 628 } 629 } 630 631 //------------------------------------------------------------------------------ 632 // 3D derivatives transpose, nodes and quadrature points collocated 633 //------------------------------------------------------------------------------ 634 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 635 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 636 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 637 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 638 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 639 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 640 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 641 } 642 } 643 644 //------------------------------------------------------------------------------ 645 // 3D quadrature weights 646 //------------------------------------------------------------------------------ 647 template <int P_1D, int Q_1D> 648 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 649 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 650 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 651 for (CeedInt q = 0; q < Q_1D; q++) { 652 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 653 } 654 } 655