1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 data.slice[data.t_id_x] = *U; 22 __syncthreads(); 23 *V = 0.0; 24 if (data.t_id_x < Q_1D) { 25 for (CeedInt i = 0; i < P_1D; i++) { 26 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 27 } 28 } 29 __syncthreads(); 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 data.slice[data.t_id_x] = *U; 38 __syncthreads(); 39 *V = 0.0; 40 if (data.t_id_x < P_1D) { 41 for (CeedInt i = 0; i < Q_1D; i++) { 42 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 43 } 44 } 45 __syncthreads(); 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D derivatives at quadrature points 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D derivatives transpose 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int Q_1D> 95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 109 __syncthreads(); 110 *V = 0.0; 111 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 112 for (CeedInt i = 0; i < P_1D; i++) { 113 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 114 } 115 } 116 __syncthreads(); 117 } 118 119 //------------------------------------------------------------------------------ 120 // 2D tensor contract y 121 //------------------------------------------------------------------------------ 122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 125 __syncthreads(); 126 *V = 0.0; 127 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 128 for (CeedInt i = 0; i < P_1D; i++) { 129 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 130 } 131 } 132 __syncthreads(); 133 } 134 135 //------------------------------------------------------------------------------ 136 // 2D transpose tensor contract y 137 //------------------------------------------------------------------------------ 138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 141 __syncthreads(); 142 *V = 0.0; 143 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 144 for (CeedInt i = 0; i < Q_1D; i++) { 145 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 146 } 147 } 148 __syncthreads(); 149 } 150 151 //------------------------------------------------------------------------------ 152 // 2D transpose tensor contract x 153 //------------------------------------------------------------------------------ 154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 157 __syncthreads(); 158 *V = 0.0; 159 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 160 for (CeedInt i = 0; i < Q_1D; i++) { 161 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 162 } 163 } 164 __syncthreads(); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract and add x 169 //------------------------------------------------------------------------------ 170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 173 __syncthreads(); 174 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 175 for (CeedInt i = 0; i < Q_1D; i++) { 176 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 177 } 178 } 179 __syncthreads(); 180 } 181 182 //------------------------------------------------------------------------------ 183 // 2D interpolate to quadrature points 184 //------------------------------------------------------------------------------ 185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 187 CeedScalar r_t[1]; 188 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 189 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 190 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 191 } 192 } 193 194 //------------------------------------------------------------------------------ 195 // 2D interpolate transpose 196 //------------------------------------------------------------------------------ 197 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 199 CeedScalar *__restrict__ r_V) { 200 CeedScalar r_t[1]; 201 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 202 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 203 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 204 } 205 } 206 207 //------------------------------------------------------------------------------ 208 // 2D derivatives at quadrature points 209 //------------------------------------------------------------------------------ 210 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 211 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 212 CeedScalar *__restrict__ r_V) { 213 CeedScalar r_t[1]; 214 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 215 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 216 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 217 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 218 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 219 } 220 } 221 222 //------------------------------------------------------------------------------ 223 // 2D derivatives transpose 224 //------------------------------------------------------------------------------ 225 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 226 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 227 CeedScalar *__restrict__ r_V) { 228 CeedScalar r_t[1]; 229 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 230 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 231 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 232 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 233 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 234 } 235 } 236 237 //------------------------------------------------------------------------------ 238 // 2D quadrature weights 239 //------------------------------------------------------------------------------ 240 template <int Q_1D> 241 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 242 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 243 } 244 245 //------------------------------------------------------------------------------ 246 // 3D 247 //------------------------------------------------------------------------------ 248 249 //------------------------------------------------------------------------------ 250 // 3D tensor contract x 251 //------------------------------------------------------------------------------ 252 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 253 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 254 CeedScalar r_B[P_1D]; 255 for (CeedInt i = 0; i < P_1D; i++) { 256 r_B[i] = B[i + data.t_id_x * P_1D]; 257 } 258 259 for (CeedInt k = 0; k < P_1D; k++) { 260 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 261 __syncthreads(); 262 V[k] = 0.0; 263 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 264 for (CeedInt i = 0; i < P_1D; i++) { 265 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 266 } 267 } 268 __syncthreads(); 269 } 270 } 271 272 //------------------------------------------------------------------------------ 273 // 3D tensor contract y 274 //------------------------------------------------------------------------------ 275 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 276 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 277 CeedScalar r_B[P_1D]; 278 for (CeedInt i = 0; i < P_1D; i++) { 279 r_B[i] = B[i + data.t_id_y * P_1D]; 280 } 281 282 for (CeedInt k = 0; k < P_1D; k++) { 283 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 284 __syncthreads(); 285 V[k] = 0.0; 286 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 287 for (CeedInt i = 0; i < P_1D; i++) { 288 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 289 } 290 } 291 __syncthreads(); 292 } 293 } 294 295 //------------------------------------------------------------------------------ 296 // 3D tensor contract z 297 //------------------------------------------------------------------------------ 298 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 299 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 300 for (CeedInt k = 0; k < Q_1D; k++) { 301 V[k] = 0.0; 302 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 303 for (CeedInt i = 0; i < P_1D; i++) { 304 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 305 } 306 } 307 } 308 } 309 310 //------------------------------------------------------------------------------ 311 // 3D transpose tensor contract z 312 //------------------------------------------------------------------------------ 313 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 314 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 315 for (CeedInt k = 0; k < P_1D; k++) { 316 V[k] = 0.0; 317 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 318 for (CeedInt i = 0; i < Q_1D; i++) { 319 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 320 } 321 } 322 } 323 } 324 325 //------------------------------------------------------------------------------ 326 // 3D transpose tensor contract y 327 //------------------------------------------------------------------------------ 328 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 329 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 330 CeedScalar r_B[Q_1D]; 331 for (CeedInt i = 0; i < Q_1D; i++) { 332 r_B[i] = B[data.t_id_y + i * P_1D]; 333 } 334 335 for (CeedInt k = 0; k < P_1D; k++) { 336 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 337 __syncthreads(); 338 V[k] = 0.0; 339 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 340 for (CeedInt i = 0; i < Q_1D; i++) { 341 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 342 } 343 } 344 __syncthreads(); 345 } 346 } 347 348 //------------------------------------------------------------------------------ 349 // 3D transpose tensor contract y 350 //------------------------------------------------------------------------------ 351 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 352 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 353 CeedScalar r_B[Q_1D]; 354 for (CeedInt i = 0; i < Q_1D; i++) { 355 r_B[i] = B[data.t_id_y + i * P_1D]; 356 } 357 358 for (CeedInt k = 0; k < P_1D; k++) { 359 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 360 __syncthreads(); 361 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 362 for (CeedInt i = 0; i < Q_1D; i++) { 363 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 364 } 365 } 366 __syncthreads(); 367 } 368 } 369 370 //------------------------------------------------------------------------------ 371 // 3D transpose tensor contract x 372 //------------------------------------------------------------------------------ 373 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 374 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 375 CeedScalar r_B[Q_1D]; 376 for (CeedInt i = 0; i < Q_1D; i++) { 377 r_B[i] = B[data.t_id_x + i * P_1D]; 378 } 379 380 for (CeedInt k = 0; k < P_1D; k++) { 381 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 382 __syncthreads(); 383 V[k] = 0.0; 384 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 385 for (CeedInt i = 0; i < Q_1D; i++) { 386 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 387 } 388 } 389 __syncthreads(); 390 } 391 } 392 393 //------------------------------------------------------------------------------ 394 // 3D transpose tensor contract add x 395 //------------------------------------------------------------------------------ 396 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 397 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 398 CeedScalar r_B[Q_1D]; 399 for (CeedInt i = 0; i < Q_1D; i++) { 400 r_B[i] = B[data.t_id_x + i * P_1D]; 401 } 402 403 for (CeedInt k = 0; k < P_1D; k++) { 404 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 405 __syncthreads(); 406 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 407 for (CeedInt i = 0; i < Q_1D; i++) { 408 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 409 } 410 } 411 __syncthreads(); 412 } 413 } 414 415 //------------------------------------------------------------------------------ 416 // 3D interpolate to quadrature points 417 //------------------------------------------------------------------------------ 418 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 419 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 420 CeedScalar r_t1[T_1D]; 421 CeedScalar r_t2[T_1D]; 422 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 423 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 424 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 425 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 426 } 427 } 428 429 //------------------------------------------------------------------------------ 430 // 3D interpolate transpose 431 //------------------------------------------------------------------------------ 432 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 433 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 434 CeedScalar *__restrict__ r_V) { 435 CeedScalar r_t1[T_1D]; 436 CeedScalar r_t2[T_1D]; 437 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 438 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 439 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 440 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 441 } 442 } 443 444 //------------------------------------------------------------------------------ 445 // 3D derivatives at quadrature points 446 //------------------------------------------------------------------------------ 447 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 448 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 449 CeedScalar *__restrict__ r_V) { 450 CeedScalar r_t1[T_1D]; 451 CeedScalar r_t2[T_1D]; 452 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 453 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 454 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 455 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 456 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 457 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 458 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 459 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 460 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 461 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 462 } 463 } 464 465 //------------------------------------------------------------------------------ 466 // 3D derivatives transpose 467 //------------------------------------------------------------------------------ 468 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 469 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 470 CeedScalar *__restrict__ r_V) { 471 CeedScalar r_t1[T_1D]; 472 CeedScalar r_t2[T_1D]; 473 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 474 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 475 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 476 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 477 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 478 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 479 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 480 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 481 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 482 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 483 } 484 } 485 486 //------------------------------------------------------------------------------ 487 // 3D derivatives at quadrature points 488 //------------------------------------------------------------------------------ 489 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 490 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 491 CeedScalar *__restrict__ r_V) { 492 CeedScalar r_t1[T_1D]; 493 CeedScalar r_t2[T_1D]; 494 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 495 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 496 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 497 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 498 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 499 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 500 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 501 } 502 } 503 504 //------------------------------------------------------------------------------ 505 // 3D derivatives transpose 506 //------------------------------------------------------------------------------ 507 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 508 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 509 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 510 CeedScalar r_t1[T_1D]; 511 CeedScalar r_t2[T_1D]; 512 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 513 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 514 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 515 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 516 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 517 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 518 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 519 } 520 } 521 522 //------------------------------------------------------------------------------ 523 // 3D quadrature weights 524 //------------------------------------------------------------------------------ 525 template <int Q_1D> 526 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 527 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 528 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 529 for (CeedInt q = 0; q < Q_1D; q++) { 530 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 531 } 532 } 533