1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 data.slice[data.t_id_x] = *U; 22 __syncthreads(); 23 *V = 0.0; 24 if (data.t_id_x < Q_1D) { 25 for (CeedInt i = 0; i < P_1D; i++) { 26 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 27 } 28 } 29 __syncthreads(); 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 data.slice[data.t_id_x] = *U; 38 __syncthreads(); 39 *V = 0.0; 40 if (data.t_id_x < P_1D) { 41 for (CeedInt i = 0; i < Q_1D; i++) { 42 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 43 } 44 } 45 __syncthreads(); 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D derivatives at quadrature points 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D derivatives transpose 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int Q_1D> 95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107 inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 108 CeedScalar *V) { 109 data.slice[t_id_x + t_id_y * T_1D] = *U; 110 __syncthreads(); 111 *V = 0.0; 112 if (t_id_x < Q_1D && t_id_y < P_1D) { 113 for (CeedInt i = 0; i < P_1D; i++) { 114 *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 115 } 116 } 117 __syncthreads(); 118 } 119 120 //------------------------------------------------------------------------------ 121 // 2D tensor contract y 122 //------------------------------------------------------------------------------ 123 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 124 inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 125 CeedScalar *V) { 126 data.slice[t_id_x + t_id_y * T_1D] = *U; 127 >>>>>>> b855402d (gpu - isolate core 2D tensor logic to allow flat version) 128 __syncthreads(); 129 *V = 0.0; 130 if (t_id_x < Q_1D && t_id_y < Q_1D) { 131 for (CeedInt i = 0; i < P_1D; i++) { 132 *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 133 } 134 } 135 __syncthreads(); 136 } 137 138 //------------------------------------------------------------------------------ 139 // 2D transpose tensor contract y 140 //------------------------------------------------------------------------------ 141 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 142 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 143 CeedScalar *V) { 144 data.slice[t_id_x + t_id_y * T_1D] = *U; 145 __syncthreads(); 146 *V = 0.0; 147 if (t_id_x < Q_1D && t_id_y < P_1D) { 148 for (CeedInt i = 0; i < Q_1D; i++) { 149 *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction 150 } 151 } 152 __syncthreads(); 153 } 154 155 //------------------------------------------------------------------------------ 156 // 2D transpose tensor contract x 157 //------------------------------------------------------------------------------ 158 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 159 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 160 CeedScalar *V) { 161 data.slice[t_id_x + t_id_y * T_1D] = *U; 162 __syncthreads(); 163 *V = 0.0; 164 if (t_id_x < P_1D && t_id_y < P_1D) { 165 for (CeedInt i = 0; i < Q_1D; i++) { 166 *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 167 } 168 } 169 __syncthreads(); 170 } 171 172 //------------------------------------------------------------------------------ 173 // 2D transpose tensor contract and add x 174 //------------------------------------------------------------------------------ 175 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 176 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, 177 CeedScalar *V) { 178 data.slice[t_id_x + t_id_y * T_1D] = *U; 179 __syncthreads(); 180 if (t_id_x < P_1D && t_id_y < P_1D) { 181 for (CeedInt i = 0; i < Q_1D; i++) { 182 *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction 183 } 184 } 185 __syncthreads(); 186 } 187 188 //------------------------------------------------------------------------------ 189 // 2D interpolate to quadrature points 190 //------------------------------------------------------------------------------ 191 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 192 inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 193 const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 194 CeedScalar r_t[1]; 195 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 196 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 197 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 198 } 199 } 200 201 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 202 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 203 InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V); 204 } 205 206 //------------------------------------------------------------------------------ 207 // 2D interpolate transpose 208 //------------------------------------------------------------------------------ 209 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 210 inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 211 const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 212 CeedScalar r_t[1]; 213 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 214 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 215 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 216 } 217 } 218 219 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 220 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 221 CeedScalar *__restrict__ r_V) { 222 InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V); 223 } 224 225 //------------------------------------------------------------------------------ 226 // 2D derivatives at quadrature points 227 //------------------------------------------------------------------------------ 228 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 229 inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 230 const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 231 CeedScalar r_t[1]; 232 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 233 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t); 234 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 235 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t); 236 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 237 } 238 } 239 240 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 241 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 242 CeedScalar *__restrict__ r_V) { 243 GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V); 244 } 245 246 //------------------------------------------------------------------------------ 247 // 2D derivatives transpose 248 //------------------------------------------------------------------------------ 249 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 250 inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U, 251 const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 252 CeedScalar r_t[1]; 253 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 254 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 255 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]); 256 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 257 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]); 258 } 259 } 260 261 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 262 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 263 CeedScalar *__restrict__ r_V) { 264 GradTansposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V); 265 } 266 267 //------------------------------------------------------------------------------ 268 // 2D quadrature weights 269 //------------------------------------------------------------------------------ 270 template <int Q_1D> 271 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 272 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 273 } 274 275 //------------------------------------------------------------------------------ 276 // 3D 277 //------------------------------------------------------------------------------ 278 279 //------------------------------------------------------------------------------ 280 // 3D tensor contract x 281 //------------------------------------------------------------------------------ 282 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 283 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 284 CeedScalar r_B[P_1D]; 285 for (CeedInt i = 0; i < P_1D; i++) { 286 r_B[i] = B[i + data.t_id_x * P_1D]; 287 } 288 289 for (CeedInt k = 0; k < P_1D; k++) { 290 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 291 __syncthreads(); 292 V[k] = 0.0; 293 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 294 for (CeedInt i = 0; i < P_1D; i++) { 295 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 296 } 297 } 298 __syncthreads(); 299 } 300 } 301 302 //------------------------------------------------------------------------------ 303 // 3D tensor contract y 304 //------------------------------------------------------------------------------ 305 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 306 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 307 CeedScalar r_B[P_1D]; 308 for (CeedInt i = 0; i < P_1D; i++) { 309 r_B[i] = B[i + data.t_id_y * P_1D]; 310 } 311 312 for (CeedInt k = 0; k < P_1D; k++) { 313 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 314 __syncthreads(); 315 V[k] = 0.0; 316 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 317 for (CeedInt i = 0; i < P_1D; i++) { 318 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 319 } 320 } 321 __syncthreads(); 322 } 323 } 324 325 //------------------------------------------------------------------------------ 326 // 3D tensor contract z 327 //------------------------------------------------------------------------------ 328 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 329 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 330 for (CeedInt k = 0; k < Q_1D; k++) { 331 V[k] = 0.0; 332 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 333 for (CeedInt i = 0; i < P_1D; i++) { 334 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 335 } 336 } 337 } 338 } 339 340 //------------------------------------------------------------------------------ 341 // 3D transpose tensor contract z 342 //------------------------------------------------------------------------------ 343 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 344 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 345 for (CeedInt k = 0; k < P_1D; k++) { 346 V[k] = 0.0; 347 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 348 for (CeedInt i = 0; i < Q_1D; i++) { 349 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 350 } 351 } 352 } 353 } 354 355 //------------------------------------------------------------------------------ 356 // 3D transpose tensor contract y 357 //------------------------------------------------------------------------------ 358 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 359 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 360 CeedScalar r_B[Q_1D]; 361 for (CeedInt i = 0; i < Q_1D; i++) { 362 r_B[i] = B[data.t_id_y + i * P_1D]; 363 } 364 365 for (CeedInt k = 0; k < P_1D; k++) { 366 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 367 __syncthreads(); 368 V[k] = 0.0; 369 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 370 for (CeedInt i = 0; i < Q_1D; i++) { 371 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 372 } 373 } 374 __syncthreads(); 375 } 376 } 377 378 //------------------------------------------------------------------------------ 379 // 3D transpose tensor contract y 380 //------------------------------------------------------------------------------ 381 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 382 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 383 CeedScalar r_B[Q_1D]; 384 for (CeedInt i = 0; i < Q_1D; i++) { 385 r_B[i] = B[data.t_id_y + i * P_1D]; 386 } 387 388 for (CeedInt k = 0; k < P_1D; k++) { 389 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 390 __syncthreads(); 391 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 392 for (CeedInt i = 0; i < Q_1D; i++) { 393 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 394 } 395 } 396 __syncthreads(); 397 } 398 } 399 400 //------------------------------------------------------------------------------ 401 // 3D transpose tensor contract x 402 //------------------------------------------------------------------------------ 403 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 404 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 405 CeedScalar r_B[Q_1D]; 406 for (CeedInt i = 0; i < Q_1D; i++) { 407 r_B[i] = B[data.t_id_x + i * P_1D]; 408 } 409 410 for (CeedInt k = 0; k < P_1D; k++) { 411 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 412 __syncthreads(); 413 V[k] = 0.0; 414 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 415 for (CeedInt i = 0; i < Q_1D; i++) { 416 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 417 } 418 } 419 __syncthreads(); 420 } 421 } 422 423 //------------------------------------------------------------------------------ 424 // 3D transpose tensor contract add x 425 //------------------------------------------------------------------------------ 426 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 427 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 428 CeedScalar r_B[Q_1D]; 429 for (CeedInt i = 0; i < Q_1D; i++) { 430 r_B[i] = B[data.t_id_x + i * P_1D]; 431 } 432 433 for (CeedInt k = 0; k < P_1D; k++) { 434 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 435 __syncthreads(); 436 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 437 for (CeedInt i = 0; i < Q_1D; i++) { 438 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 439 } 440 } 441 __syncthreads(); 442 } 443 } 444 445 //------------------------------------------------------------------------------ 446 // 3D interpolate to quadrature points 447 //------------------------------------------------------------------------------ 448 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 449 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 450 CeedScalar r_t1[T_1D]; 451 CeedScalar r_t2[T_1D]; 452 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 453 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 454 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 455 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 456 } 457 } 458 459 //------------------------------------------------------------------------------ 460 // 3D interpolate transpose 461 //------------------------------------------------------------------------------ 462 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 463 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 464 CeedScalar *__restrict__ r_V) { 465 CeedScalar r_t1[T_1D]; 466 CeedScalar r_t2[T_1D]; 467 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 468 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 469 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 470 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 471 } 472 } 473 474 //------------------------------------------------------------------------------ 475 // 3D derivatives at quadrature points 476 //------------------------------------------------------------------------------ 477 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 478 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 479 CeedScalar *__restrict__ r_V) { 480 CeedScalar r_t1[T_1D]; 481 CeedScalar r_t2[T_1D]; 482 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 483 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 484 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 485 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 486 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 487 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 488 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 489 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 490 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 491 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 492 } 493 } 494 495 //------------------------------------------------------------------------------ 496 // 3D derivatives transpose 497 //------------------------------------------------------------------------------ 498 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 499 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 500 CeedScalar *__restrict__ r_V) { 501 CeedScalar r_t1[T_1D]; 502 CeedScalar r_t2[T_1D]; 503 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 504 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 505 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 506 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 507 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 508 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 509 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 510 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 511 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 512 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 513 } 514 } 515 516 //------------------------------------------------------------------------------ 517 // 3D derivatives at quadrature points 518 //------------------------------------------------------------------------------ 519 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 520 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 521 CeedScalar *__restrict__ r_V) { 522 CeedScalar r_t1[T_1D]; 523 CeedScalar r_t2[T_1D]; 524 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 525 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 526 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 527 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 528 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 529 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 530 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 531 } 532 } 533 534 //------------------------------------------------------------------------------ 535 // 3D derivatives transpose 536 //------------------------------------------------------------------------------ 537 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 538 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 539 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 540 CeedScalar r_t1[T_1D]; 541 CeedScalar r_t2[T_1D]; 542 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 543 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 544 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 545 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 546 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 547 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 548 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 549 } 550 } 551 552 //------------------------------------------------------------------------------ 553 // 3D quadrature weights 554 //------------------------------------------------------------------------------ 555 template <int Q_1D> 556 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 557 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 558 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 559 for (CeedInt q = 0; q < Q_1D; q++) { 560 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 561 } 562 } 563