1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 __syncthreads(); 22 data.slice[data.t_id_x] = *U; 23 __syncthreads(); 24 *V = 0.0; 25 if (data.t_id_x < Q_1D) { 26 for (CeedInt i = 0; i < P_1D; i++) { 27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 28 } 29 } 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 __syncthreads(); 38 data.slice[data.t_id_x] = *U; 39 __syncthreads(); 40 *V = 0.0; 41 if (data.t_id_x < P_1D) { 42 for (CeedInt i = 0; i < Q_1D; i++) { 43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 44 } 45 } 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D derivatives at quadrature points 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D derivatives transpose 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int P_1D, int Q_1D> 95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108 __syncthreads(); 109 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 110 __syncthreads(); 111 *V = 0.0; 112 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 113 for (CeedInt i = 0; i < P_1D; i++) { 114 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 115 } 116 } 117 } 118 119 //------------------------------------------------------------------------------ 120 // 2D tensor contract y 121 //------------------------------------------------------------------------------ 122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 __syncthreads(); 125 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 126 __syncthreads(); 127 *V = 0.0; 128 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 129 for (CeedInt i = 0; i < P_1D; i++) { 130 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 131 } 132 } 133 } 134 135 //------------------------------------------------------------------------------ 136 // 2D transpose tensor contract y 137 //------------------------------------------------------------------------------ 138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 __syncthreads(); 141 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 142 __syncthreads(); 143 *V = 0.0; 144 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 145 for (CeedInt i = 0; i < Q_1D; i++) { 146 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 147 } 148 } 149 } 150 151 //------------------------------------------------------------------------------ 152 // 2D transpose tensor contract x 153 //------------------------------------------------------------------------------ 154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156 __syncthreads(); 157 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 158 __syncthreads(); 159 *V = 0.0; 160 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 161 for (CeedInt i = 0; i < Q_1D; i++) { 162 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 163 } 164 } 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract and add x 169 //------------------------------------------------------------------------------ 170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172 __syncthreads(); 173 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 174 __syncthreads(); 175 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 176 for (CeedInt i = 0; i < Q_1D; i++) { 177 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 178 } 179 } 180 } 181 182 //------------------------------------------------------------------------------ 183 // 2D interpolate to quadrature points 184 //------------------------------------------------------------------------------ 185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 187 CeedScalar r_t[1]; 188 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 189 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 190 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 191 } 192 } 193 194 //------------------------------------------------------------------------------ 195 // 2D interpolate transpose 196 //------------------------------------------------------------------------------ 197 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 199 CeedScalar *__restrict__ r_V) { 200 CeedScalar r_t[1]; 201 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 202 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 203 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 204 } 205 } 206 207 //------------------------------------------------------------------------------ 208 // 2D derivatives at quadrature points 209 //------------------------------------------------------------------------------ 210 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 211 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 212 CeedScalar *__restrict__ r_V) { 213 CeedScalar r_t[1]; 214 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 215 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 216 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 217 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 218 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 219 } 220 } 221 222 //------------------------------------------------------------------------------ 223 // 2D derivatives transpose 224 //------------------------------------------------------------------------------ 225 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 226 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 227 CeedScalar *__restrict__ r_V) { 228 CeedScalar r_t[1]; 229 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 230 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 231 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 232 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 233 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 234 } 235 } 236 237 //------------------------------------------------------------------------------ 238 // 2D derivatives at quadrature points, nodes and quadrature points collocated 239 //------------------------------------------------------------------------------ 240 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 241 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 242 CeedScalar *__restrict__ r_V) { 243 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 244 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 245 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 246 } 247 } 248 249 //------------------------------------------------------------------------------ 250 // 2D derivatives transpose, nodes and quadrature points collocated 251 //------------------------------------------------------------------------------ 252 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 253 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 254 CeedScalar *__restrict__ r_V) { 255 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 256 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 257 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 258 } 259 } 260 261 //------------------------------------------------------------------------------ 262 // 2D quadrature weights 263 //------------------------------------------------------------------------------ 264 template <int P_1D, int Q_1D> 265 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 266 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 267 } 268 269 //------------------------------------------------------------------------------ 270 // 3D 271 //------------------------------------------------------------------------------ 272 273 //------------------------------------------------------------------------------ 274 // 3D tensor contract x 275 //------------------------------------------------------------------------------ 276 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 277 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 278 CeedScalar r_B[P_1D]; 279 for (CeedInt i = 0; i < P_1D; i++) { 280 r_B[i] = B[i + data.t_id_x * P_1D]; 281 } 282 283 for (CeedInt k = 0; k < P_1D; k++) { 284 __syncthreads(); 285 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 286 __syncthreads(); 287 V[k] = 0.0; 288 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 289 for (CeedInt i = 0; i < P_1D; i++) { 290 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 291 } 292 } 293 } 294 } 295 296 //------------------------------------------------------------------------------ 297 // 3D tensor contract y 298 //------------------------------------------------------------------------------ 299 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 300 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 301 CeedScalar r_B[P_1D]; 302 for (CeedInt i = 0; i < P_1D; i++) { 303 r_B[i] = B[i + data.t_id_y * P_1D]; 304 } 305 306 for (CeedInt k = 0; k < P_1D; k++) { 307 __syncthreads(); 308 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 309 __syncthreads(); 310 V[k] = 0.0; 311 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 312 for (CeedInt i = 0; i < P_1D; i++) { 313 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 314 } 315 } 316 } 317 } 318 319 //------------------------------------------------------------------------------ 320 // 3D tensor contract z 321 //------------------------------------------------------------------------------ 322 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 323 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 324 for (CeedInt k = 0; k < Q_1D; k++) { 325 V[k] = 0.0; 326 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 327 for (CeedInt i = 0; i < P_1D; i++) { 328 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 329 } 330 } 331 } 332 } 333 334 //------------------------------------------------------------------------------ 335 // 3D transpose tensor contract z 336 //------------------------------------------------------------------------------ 337 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 338 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 339 for (CeedInt k = 0; k < P_1D; k++) { 340 V[k] = 0.0; 341 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 342 for (CeedInt i = 0; i < Q_1D; i++) { 343 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 344 } 345 } 346 } 347 } 348 349 //------------------------------------------------------------------------------ 350 // 3D transpose tensor contract y 351 //------------------------------------------------------------------------------ 352 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 353 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 354 CeedScalar r_B[Q_1D]; 355 for (CeedInt i = 0; i < Q_1D; i++) { 356 r_B[i] = B[data.t_id_y + i * P_1D]; 357 } 358 359 for (CeedInt k = 0; k < P_1D; k++) { 360 __syncthreads(); 361 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 362 __syncthreads(); 363 V[k] = 0.0; 364 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 365 for (CeedInt i = 0; i < Q_1D; i++) { 366 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 367 } 368 } 369 } 370 } 371 372 //------------------------------------------------------------------------------ 373 // 3D transpose tensor contract y 374 //------------------------------------------------------------------------------ 375 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 376 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 377 CeedScalar r_B[Q_1D]; 378 for (CeedInt i = 0; i < Q_1D; i++) { 379 r_B[i] = B[data.t_id_y + i * P_1D]; 380 } 381 382 for (CeedInt k = 0; k < P_1D; k++) { 383 __syncthreads(); 384 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 385 __syncthreads(); 386 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 387 for (CeedInt i = 0; i < Q_1D; i++) { 388 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 389 } 390 } 391 } 392 } 393 394 //------------------------------------------------------------------------------ 395 // 3D transpose tensor contract x 396 //------------------------------------------------------------------------------ 397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 398 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 399 CeedScalar r_B[Q_1D]; 400 for (CeedInt i = 0; i < Q_1D; i++) { 401 r_B[i] = B[data.t_id_x + i * P_1D]; 402 } 403 404 for (CeedInt k = 0; k < P_1D; k++) { 405 __syncthreads(); 406 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 407 __syncthreads(); 408 V[k] = 0.0; 409 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 410 for (CeedInt i = 0; i < Q_1D; i++) { 411 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 412 } 413 } 414 } 415 } 416 417 //------------------------------------------------------------------------------ 418 // 3D transpose tensor contract add x 419 //------------------------------------------------------------------------------ 420 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 421 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 422 CeedScalar r_B[Q_1D]; 423 for (CeedInt i = 0; i < Q_1D; i++) { 424 r_B[i] = B[data.t_id_x + i * P_1D]; 425 } 426 427 for (CeedInt k = 0; k < P_1D; k++) { 428 __syncthreads(); 429 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 430 __syncthreads(); 431 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 432 for (CeedInt i = 0; i < Q_1D; i++) { 433 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 434 } 435 } 436 } 437 } 438 439 //------------------------------------------------------------------------------ 440 // 3D interpolate to quadrature points 441 //------------------------------------------------------------------------------ 442 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 443 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 444 CeedScalar r_t1[T_1D]; 445 CeedScalar r_t2[T_1D]; 446 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 447 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 448 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 449 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 450 } 451 } 452 453 //------------------------------------------------------------------------------ 454 // 3D interpolate transpose 455 //------------------------------------------------------------------------------ 456 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 457 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 458 CeedScalar *__restrict__ r_V) { 459 CeedScalar r_t1[T_1D]; 460 CeedScalar r_t2[T_1D]; 461 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 462 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 463 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 464 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 465 } 466 } 467 468 //------------------------------------------------------------------------------ 469 // 3D derivatives at quadrature points 470 //------------------------------------------------------------------------------ 471 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 472 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 473 CeedScalar *__restrict__ r_V) { 474 CeedScalar r_t1[T_1D]; 475 CeedScalar r_t2[T_1D]; 476 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 477 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 478 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 479 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 480 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 481 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 482 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 483 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 484 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 485 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 486 } 487 } 488 489 //------------------------------------------------------------------------------ 490 // 3D derivatives transpose 491 //------------------------------------------------------------------------------ 492 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 493 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 494 CeedScalar *__restrict__ r_V) { 495 CeedScalar r_t1[T_1D]; 496 CeedScalar r_t2[T_1D]; 497 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 498 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 499 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 500 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 501 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 502 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 503 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 504 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 505 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 506 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 507 } 508 } 509 510 //------------------------------------------------------------------------------ 511 // 3D derivatives at quadrature points 512 //------------------------------------------------------------------------------ 513 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 514 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 515 CeedScalar *__restrict__ r_V) { 516 CeedScalar r_t1[T_1D]; 517 CeedScalar r_t2[T_1D]; 518 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 519 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 520 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 521 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 522 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 523 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 524 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 525 } 526 } 527 528 //------------------------------------------------------------------------------ 529 // 3D derivatives transpose 530 //------------------------------------------------------------------------------ 531 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 532 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 533 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 534 CeedScalar r_t1[T_1D]; 535 CeedScalar r_t2[T_1D]; 536 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 537 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 538 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 539 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 540 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 541 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 542 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 543 } 544 } 545 546 //------------------------------------------------------------------------------ 547 // 3D derivatives at quadrature points, nodes and quadrature points collocated 548 //------------------------------------------------------------------------------ 549 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 550 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 551 CeedScalar *__restrict__ r_V) { 552 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 553 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 554 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 555 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 556 } 557 } 558 559 //------------------------------------------------------------------------------ 560 // 3D derivatives transpose, nodes and quadrature points collocated 561 //------------------------------------------------------------------------------ 562 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 563 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 564 CeedScalar *__restrict__ r_V) { 565 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 566 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 567 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 568 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 569 } 570 } 571 572 //------------------------------------------------------------------------------ 573 // 3D quadrature weights 574 //------------------------------------------------------------------------------ 575 template <int P_1D, int Q_1D> 576 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 577 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 578 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 579 for (CeedInt q = 0; q < Q_1D; q++) { 580 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 581 } 582 } 583