1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 __syncthreads(); 22 data.slice[data.t_id_x] = *U; 23 __syncthreads(); 24 *V = 0.0; 25 if (data.t_id_x < Q_1D) { 26 for (CeedInt i = 0; i < P_1D; i++) { 27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 28 } 29 } 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 __syncthreads(); 38 data.slice[data.t_id_x] = *U; 39 __syncthreads(); 40 *V = 0.0; 41 if (data.t_id_x < P_1D) { 42 for (CeedInt i = 0; i < Q_1D; i++) { 43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 44 } 45 } 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D interpolate to quadrature points, nodes and quadrature points collocated 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void InterpCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 r_V[comp] = r_U[comp]; 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D interpolate transpose, nodes and quadrature points collocated 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 r_V[comp] = r_U[comp]; 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D derivatives at quadrature points 93 //------------------------------------------------------------------------------ 94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 95 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 96 CeedScalar *__restrict__ r_V) { 97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 98 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 99 } 100 } 101 102 //------------------------------------------------------------------------------ 103 // 1D derivatives transpose 104 //------------------------------------------------------------------------------ 105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 106 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 107 CeedScalar *__restrict__ r_V) { 108 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 109 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 110 } 111 } 112 113 //------------------------------------------------------------------------------ 114 // 1D quadrature weights 115 //------------------------------------------------------------------------------ 116 template <int P_1D, int Q_1D> 117 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 118 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 119 } 120 121 //------------------------------------------------------------------------------ 122 // 2D 123 //------------------------------------------------------------------------------ 124 125 //------------------------------------------------------------------------------ 126 // 2D tensor contraction x 127 //------------------------------------------------------------------------------ 128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 129 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 130 __syncthreads(); 131 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 132 __syncthreads(); 133 *V = 0.0; 134 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 135 for (CeedInt i = 0; i < P_1D; i++) { 136 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 137 } 138 } 139 } 140 141 //------------------------------------------------------------------------------ 142 // 2D tensor contract y 143 //------------------------------------------------------------------------------ 144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 145 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 146 __syncthreads(); 147 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 148 __syncthreads(); 149 *V = 0.0; 150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 151 for (CeedInt i = 0; i < P_1D; i++) { 152 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 153 } 154 } 155 } 156 157 //------------------------------------------------------------------------------ 158 // 2D transpose tensor contract y 159 //------------------------------------------------------------------------------ 160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 161 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 162 __syncthreads(); 163 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 164 __syncthreads(); 165 *V = 0.0; 166 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 167 for (CeedInt i = 0; i < Q_1D; i++) { 168 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 169 } 170 } 171 } 172 173 //------------------------------------------------------------------------------ 174 // 2D transpose tensor contract x 175 //------------------------------------------------------------------------------ 176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 177 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 178 __syncthreads(); 179 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 180 __syncthreads(); 181 *V = 0.0; 182 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 183 for (CeedInt i = 0; i < Q_1D; i++) { 184 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 185 } 186 } 187 } 188 189 //------------------------------------------------------------------------------ 190 // 2D transpose tensor contract and add x 191 //------------------------------------------------------------------------------ 192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 193 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 194 __syncthreads(); 195 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 196 __syncthreads(); 197 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 198 for (CeedInt i = 0; i < Q_1D; i++) { 199 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 200 } 201 } 202 } 203 204 //------------------------------------------------------------------------------ 205 // 2D interpolate to quadrature points 206 //------------------------------------------------------------------------------ 207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 208 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 209 CeedScalar r_t[1]; 210 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 211 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 212 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 213 } 214 } 215 216 //------------------------------------------------------------------------------ 217 // 2D interpolate transpose 218 //------------------------------------------------------------------------------ 219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 220 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 221 CeedScalar *__restrict__ r_V) { 222 CeedScalar r_t[1]; 223 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 224 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 225 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 226 } 227 } 228 229 //------------------------------------------------------------------------------ 230 // 2D interpolate to quadrature points, nodes and quadrature points collocated 231 //------------------------------------------------------------------------------ 232 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 233 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 234 CeedScalar *__restrict__ r_V) { 235 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 236 r_V[comp] = r_U[comp]; 237 } 238 } 239 240 //------------------------------------------------------------------------------ 241 // 2D interpolate transpose, nodes and quadrature points collocated 242 //------------------------------------------------------------------------------ 243 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 244 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 245 CeedScalar *__restrict__ r_V) { 246 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 247 r_V[comp] = r_U[comp]; 248 } 249 } 250 251 //------------------------------------------------------------------------------ 252 // 2D derivatives at quadrature points 253 //------------------------------------------------------------------------------ 254 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 255 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 256 CeedScalar *__restrict__ r_V) { 257 CeedScalar r_t[1]; 258 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 259 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 260 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 261 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 262 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 263 } 264 } 265 266 //------------------------------------------------------------------------------ 267 // 2D derivatives transpose 268 //------------------------------------------------------------------------------ 269 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 270 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 271 CeedScalar *__restrict__ r_V) { 272 CeedScalar r_t[1]; 273 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 274 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 275 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 276 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 277 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 278 } 279 } 280 281 //------------------------------------------------------------------------------ 282 // 2D derivatives at quadrature points, nodes and quadrature points collocated 283 //------------------------------------------------------------------------------ 284 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 285 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 286 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 287 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 288 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 289 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 290 } 291 } 292 293 //------------------------------------------------------------------------------ 294 // 2D derivatives transpose, nodes and quadrature points collocated 295 //------------------------------------------------------------------------------ 296 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 297 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 298 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 299 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 300 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 301 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 302 } 303 } 304 305 //------------------------------------------------------------------------------ 306 // 2D quadrature weights 307 //------------------------------------------------------------------------------ 308 template <int P_1D, int Q_1D> 309 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 310 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 311 } 312 313 //------------------------------------------------------------------------------ 314 // 3D 315 //------------------------------------------------------------------------------ 316 317 //------------------------------------------------------------------------------ 318 // 3D tensor contract x 319 //------------------------------------------------------------------------------ 320 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 321 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 322 CeedScalar r_B[P_1D]; 323 for (CeedInt i = 0; i < P_1D; i++) { 324 r_B[i] = B[i + data.t_id_x * P_1D]; 325 } 326 327 for (CeedInt k = 0; k < P_1D; k++) { 328 __syncthreads(); 329 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 330 __syncthreads(); 331 V[k] = 0.0; 332 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 333 for (CeedInt i = 0; i < P_1D; i++) { 334 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 335 } 336 } 337 } 338 } 339 340 //------------------------------------------------------------------------------ 341 // 3D tensor contract y 342 //------------------------------------------------------------------------------ 343 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 344 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 345 CeedScalar r_B[P_1D]; 346 for (CeedInt i = 0; i < P_1D; i++) { 347 r_B[i] = B[i + data.t_id_y * P_1D]; 348 } 349 350 for (CeedInt k = 0; k < P_1D; k++) { 351 __syncthreads(); 352 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 353 __syncthreads(); 354 V[k] = 0.0; 355 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 356 for (CeedInt i = 0; i < P_1D; i++) { 357 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 358 } 359 } 360 } 361 } 362 363 //------------------------------------------------------------------------------ 364 // 3D tensor contract z 365 //------------------------------------------------------------------------------ 366 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 367 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 368 for (CeedInt k = 0; k < Q_1D; k++) { 369 V[k] = 0.0; 370 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 371 for (CeedInt i = 0; i < P_1D; i++) { 372 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 373 } 374 } 375 } 376 } 377 378 //------------------------------------------------------------------------------ 379 // 3D transpose tensor contract z 380 //------------------------------------------------------------------------------ 381 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 382 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 383 for (CeedInt k = 0; k < P_1D; k++) { 384 V[k] = 0.0; 385 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 386 for (CeedInt i = 0; i < Q_1D; i++) { 387 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 388 } 389 } 390 } 391 } 392 393 //------------------------------------------------------------------------------ 394 // 3D transpose tensor contract y 395 //------------------------------------------------------------------------------ 396 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 397 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 398 CeedScalar r_B[Q_1D]; 399 for (CeedInt i = 0; i < Q_1D; i++) { 400 r_B[i] = B[data.t_id_y + i * P_1D]; 401 } 402 403 for (CeedInt k = 0; k < P_1D; k++) { 404 __syncthreads(); 405 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 406 __syncthreads(); 407 V[k] = 0.0; 408 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 409 for (CeedInt i = 0; i < Q_1D; i++) { 410 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 411 } 412 } 413 } 414 } 415 416 //------------------------------------------------------------------------------ 417 // 3D transpose tensor contract y 418 //------------------------------------------------------------------------------ 419 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 420 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 421 CeedScalar r_B[Q_1D]; 422 for (CeedInt i = 0; i < Q_1D; i++) { 423 r_B[i] = B[data.t_id_y + i * P_1D]; 424 } 425 426 for (CeedInt k = 0; k < P_1D; k++) { 427 __syncthreads(); 428 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 429 __syncthreads(); 430 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 431 for (CeedInt i = 0; i < Q_1D; i++) { 432 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 433 } 434 } 435 } 436 } 437 438 //------------------------------------------------------------------------------ 439 // 3D transpose tensor contract x 440 //------------------------------------------------------------------------------ 441 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 442 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 443 CeedScalar r_B[Q_1D]; 444 for (CeedInt i = 0; i < Q_1D; i++) { 445 r_B[i] = B[data.t_id_x + i * P_1D]; 446 } 447 448 for (CeedInt k = 0; k < P_1D; k++) { 449 __syncthreads(); 450 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 451 __syncthreads(); 452 V[k] = 0.0; 453 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 454 for (CeedInt i = 0; i < Q_1D; i++) { 455 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 456 } 457 } 458 } 459 } 460 461 //------------------------------------------------------------------------------ 462 // 3D transpose tensor contract add x 463 //------------------------------------------------------------------------------ 464 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 465 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 466 CeedScalar r_B[Q_1D]; 467 for (CeedInt i = 0; i < Q_1D; i++) { 468 r_B[i] = B[data.t_id_x + i * P_1D]; 469 } 470 471 for (CeedInt k = 0; k < P_1D; k++) { 472 __syncthreads(); 473 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 474 __syncthreads(); 475 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 476 for (CeedInt i = 0; i < Q_1D; i++) { 477 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 478 } 479 } 480 } 481 } 482 483 //------------------------------------------------------------------------------ 484 // 3D interpolate to quadrature points 485 //------------------------------------------------------------------------------ 486 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 487 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 488 CeedScalar r_t1[T_1D]; 489 CeedScalar r_t2[T_1D]; 490 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 491 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 492 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 493 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 494 } 495 } 496 497 //------------------------------------------------------------------------------ 498 // 3D interpolate transpose 499 //------------------------------------------------------------------------------ 500 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 501 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 502 CeedScalar *__restrict__ r_V) { 503 CeedScalar r_t1[T_1D]; 504 CeedScalar r_t2[T_1D]; 505 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 506 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 507 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 508 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 509 } 510 } 511 512 //------------------------------------------------------------------------------ 513 // 3D interpolate to quadrature points, nodes and quadrature points collocated 514 //------------------------------------------------------------------------------ 515 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 516 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 517 CeedScalar *__restrict__ r_V) { 518 for (CeedInt i = 0; i < Q_1D; i++) { 519 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 520 r_V[i + comp * Q_1D] = r_U[i + comp * P_1D]; 521 } 522 } 523 } 524 525 //------------------------------------------------------------------------------ 526 // 3D interpolate transpose, nodes and quadrature points collocated 527 //------------------------------------------------------------------------------ 528 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 529 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 530 CeedScalar *__restrict__ r_V) { 531 for (CeedInt i = 0; i < Q_1D; i++) { 532 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 533 r_V[i + comp * P_1D] = r_U[i + comp * Q_1D]; 534 } 535 } 536 } 537 538 //------------------------------------------------------------------------------ 539 // 3D derivatives at quadrature points 540 //------------------------------------------------------------------------------ 541 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 542 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 543 CeedScalar *__restrict__ r_V) { 544 CeedScalar r_t1[T_1D]; 545 CeedScalar r_t2[T_1D]; 546 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 547 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 548 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 549 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 550 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 551 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 552 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 553 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 554 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 555 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 556 } 557 } 558 559 //------------------------------------------------------------------------------ 560 // 3D derivatives transpose 561 //------------------------------------------------------------------------------ 562 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 563 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 564 CeedScalar *__restrict__ r_V) { 565 CeedScalar r_t1[T_1D]; 566 CeedScalar r_t2[T_1D]; 567 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 568 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 569 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 570 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 571 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 572 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 573 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 574 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 575 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 576 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 577 } 578 } 579 580 //------------------------------------------------------------------------------ 581 // 3D derivatives at quadrature points 582 //------------------------------------------------------------------------------ 583 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 584 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 585 CeedScalar *__restrict__ r_V) { 586 CeedScalar r_t1[T_1D]; 587 CeedScalar r_t2[T_1D]; 588 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 589 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 590 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 591 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 592 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 593 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 594 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 595 } 596 } 597 598 //------------------------------------------------------------------------------ 599 // 3D derivatives transpose 600 //------------------------------------------------------------------------------ 601 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 602 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 603 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 604 CeedScalar r_t1[T_1D]; 605 CeedScalar r_t2[T_1D]; 606 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 607 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 608 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 609 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 610 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 611 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 612 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 613 } 614 } 615 616 //------------------------------------------------------------------------------ 617 // 3D derivatives at quadrature points, nodes and quadrature points collocated 618 //------------------------------------------------------------------------------ 619 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 620 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 621 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 622 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 623 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 624 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 625 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 626 } 627 } 628 629 //------------------------------------------------------------------------------ 630 // 3D derivatives transpose, nodes and quadrature points collocated 631 //------------------------------------------------------------------------------ 632 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 633 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 634 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 635 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 636 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 637 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 638 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 639 } 640 } 641 642 //------------------------------------------------------------------------------ 643 // 3D quadrature weights 644 //------------------------------------------------------------------------------ 645 template <int P_1D, int Q_1D> 646 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 647 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 648 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 649 for (CeedInt q = 0; q < Q_1D; q++) { 650 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 651 } 652 } 653