1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis templates 10 #ifndef _ceed_hip_shared_basis_tensor_templates_h 11 #define _ceed_hip_shared_basis_tensor_templates_h 12 13 #include <ceed.h> 14 15 //------------------------------------------------------------------------------ 16 // 1D 17 //------------------------------------------------------------------------------ 18 19 //------------------------------------------------------------------------------ 20 // 1D tensor contraction x 21 //------------------------------------------------------------------------------ 22 template <int NUM_COMP, int P_1D, int Q_1D> 23 inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 24 data.slice[data.t_id_x] = *U; 25 __syncthreads(); 26 *V = 0.0; 27 if (data.t_id_x < Q_1D) { 28 for (CeedInt i = 0; i < P_1D; i++) { 29 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 30 } 31 } 32 __syncthreads(); 33 } 34 35 //------------------------------------------------------------------------------ 36 // 1D transpose tensor contraction x 37 //------------------------------------------------------------------------------ 38 template <int NUM_COMP, int P_1D, int Q_1D> 39 inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 40 data.slice[data.t_id_x] = *U; 41 __syncthreads(); 42 *V = 0.0; 43 if (data.t_id_x < P_1D) { 44 for (CeedInt i = 0; i < Q_1D; i++) { 45 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 46 } 47 } 48 __syncthreads(); 49 } 50 51 //------------------------------------------------------------------------------ 52 // 1D interpolate to quadrature points 53 //------------------------------------------------------------------------------ 54 template <int NUM_COMP, int P_1D, int Q_1D> 55 inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 56 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 57 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 58 } 59 } 60 61 //------------------------------------------------------------------------------ 62 // 1D interpolate transpose 63 //------------------------------------------------------------------------------ 64 template <int NUM_COMP, int P_1D, int Q_1D> 65 inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 66 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 67 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 68 } 69 } 70 71 //------------------------------------------------------------------------------ 72 // 1D derivatives at quadrature points 73 //------------------------------------------------------------------------------ 74 template <int NUM_COMP, int P_1D, int Q_1D> 75 inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 76 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 77 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 78 } 79 } 80 81 //------------------------------------------------------------------------------ 82 // 1D derivatives transpose 83 //------------------------------------------------------------------------------ 84 template <int NUM_COMP, int P_1D, int Q_1D> 85 inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int Q_1D> 95 inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D> 107 inline __device__ void ContractX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108 data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 109 __syncthreads(); 110 *V = 0.0; 111 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 112 for (CeedInt i = 0; i < P_1D; i++) { 113 *V += B[i + data.t_id_x*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 114 } 115 } 116 __syncthreads(); 117 } 118 119 //------------------------------------------------------------------------------ 120 // 2D tensor contract y 121 //------------------------------------------------------------------------------ 122 template <int NUM_COMP, int P_1D, int Q_1D> 123 inline __device__ void ContractY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 125 __syncthreads(); 126 *V = 0.0; 127 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 128 for (CeedInt i = 0; i < P_1D; i++) { 129 *V += B[i + data.t_id_y*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 130 } 131 } 132 __syncthreads(); 133 } 134 135 //------------------------------------------------------------------------------ 136 // 2D transpose tensor contract y 137 //------------------------------------------------------------------------------ 138 template <int NUM_COMP, int P_1D, int Q_1D> 139 inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 141 __syncthreads(); 142 *V = 0.0; 143 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 144 for (CeedInt i = 0; i < Q_1D; i++) { 145 *V += B[data.t_id_y + i*P_1D] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 146 } 147 } 148 __syncthreads(); 149 } 150 151 //------------------------------------------------------------------------------ 152 // 2D transpose tensor contract x 153 //------------------------------------------------------------------------------ 154 template <int NUM_COMP, int P_1D, int Q_1D> 155 inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156 data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 157 __syncthreads(); 158 *V = 0.0; 159 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 160 for (CeedInt i = 0; i < Q_1D; i++) { 161 *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 162 } 163 } 164 __syncthreads(); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract and add x 169 //------------------------------------------------------------------------------ 170 template <int NUM_COMP, int P_1D, int Q_1D> 171 inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172 data.slice[data.t_id_x+data.t_id_y*T_1D] = *U; 173 __syncthreads(); 174 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 175 for (CeedInt i = 0; i < Q_1D; i++) { 176 *V += B[data.t_id_x + i*P_1D] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 177 } 178 } 179 __syncthreads(); 180 } 181 182 //------------------------------------------------------------------------------ 183 // 2D interpolate to quadrature points 184 //------------------------------------------------------------------------------ 185 template <int NUM_COMP, int P_1D, int Q_1D> 186 inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 187 CeedScalar r_t[1]; 188 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 189 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 190 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 191 } 192 } 193 194 //------------------------------------------------------------------------------ 195 // 2D interpolate transpose 196 //------------------------------------------------------------------------------ 197 template <int NUM_COMP, int P_1D, int Q_1D> 198 inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 199 CeedScalar r_t[1]; 200 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 201 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 202 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 203 } 204 } 205 206 //------------------------------------------------------------------------------ 207 // 2D derivatives at quadrature points 208 //------------------------------------------------------------------------------ 209 template <int NUM_COMP, int P_1D, int Q_1D> 210 inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 211 CeedScalar r_t[1]; 212 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 213 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t); 214 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0*NUM_COMP); 215 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 216 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1*NUM_COMP); 217 } 218 } 219 220 //------------------------------------------------------------------------------ 221 // 2D derivatives transpose 222 //------------------------------------------------------------------------------ 223 template <int NUM_COMP, int P_1D, int Q_1D> 224 inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 225 CeedScalar r_t[1]; 226 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 227 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0*NUM_COMP, c_B, r_t); 228 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp); 229 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1*NUM_COMP, c_G, r_t); 230 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 231 } 232 } 233 234 //------------------------------------------------------------------------------ 235 // 2D quadrature weights 236 //------------------------------------------------------------------------------ 237 template <int Q_1D> 238 inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 239 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? 240 q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; 241 } 242 243 //------------------------------------------------------------------------------ 244 // 3D 245 //------------------------------------------------------------------------------ 246 247 //------------------------------------------------------------------------------ 248 // 3D tensor contract x 249 //------------------------------------------------------------------------------ 250 template <int NUM_COMP, int P_1D, int Q_1D> 251 inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 252 CeedScalar r_B[P_1D]; 253 for (CeedInt i = 0; i < P_1D; i++) { 254 r_B[i] = B[i + data.t_id_x*P_1D]; 255 } 256 257 for (CeedInt k = 0; k < P_1D; k++) { 258 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 259 __syncthreads(); 260 V[k] = 0.0; 261 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 262 for (CeedInt i = 0; i < P_1D; i++) { 263 V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 264 } 265 } 266 __syncthreads(); 267 } 268 } 269 270 //------------------------------------------------------------------------------ 271 // 3D tensor contract y 272 //------------------------------------------------------------------------------ 273 template <int NUM_COMP, int P_1D, int Q_1D> 274 inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 275 CeedScalar r_B[P_1D]; 276 for (CeedInt i = 0; i < P_1D; i++) { 277 r_B[i] = B[i + data.t_id_y*P_1D]; 278 } 279 280 for (CeedInt k = 0; k < P_1D; k++) { 281 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 282 __syncthreads(); 283 V[k] = 0.0; 284 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 285 for (CeedInt i = 0; i < P_1D; i++) { 286 V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 287 } 288 } 289 __syncthreads(); 290 } 291 } 292 293 //------------------------------------------------------------------------------ 294 // 3D tensor contract z 295 //------------------------------------------------------------------------------ 296 template <int NUM_COMP, int P_1D, int Q_1D> 297 inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 298 for (CeedInt k = 0; k < Q_1D; k++) { 299 V[k] = 0.0; 300 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 301 for (CeedInt i = 0; i < P_1D; i++) { 302 V[k] += B[i + k*P_1D] * U[i]; // Contract z direction 303 } 304 } 305 } 306 } 307 308 //------------------------------------------------------------------------------ 309 // 3D transpose tensor contract z 310 //------------------------------------------------------------------------------ 311 template <int NUM_COMP, int P_1D, int Q_1D> 312 inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 313 for (CeedInt k = 0; k < P_1D; k++) { 314 V[k] = 0.0; 315 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 316 for (CeedInt i = 0; i < Q_1D; i++) { 317 V[k] += B[k + i*P_1D] * U[i]; // Contract z direction 318 } 319 } 320 } 321 } 322 323 //------------------------------------------------------------------------------ 324 // 3D transpose tensor contract y 325 //------------------------------------------------------------------------------ 326 template <int NUM_COMP, int P_1D, int Q_1D> 327 inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 328 CeedScalar r_B[Q_1D]; 329 for (CeedInt i = 0; i < Q_1D; i++) { 330 r_B[i] = B[data.t_id_y + i*P_1D]; 331 } 332 333 for (CeedInt k = 0; k < P_1D; k++) { 334 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 335 __syncthreads(); 336 V[k] = 0.0; 337 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 338 for (CeedInt i = 0; i < Q_1D; i++) { 339 V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 340 } 341 } 342 __syncthreads(); 343 } 344 } 345 346 //------------------------------------------------------------------------------ 347 // 3D transpose tensor contract y 348 //------------------------------------------------------------------------------ 349 template <int NUM_COMP, int P_1D, int Q_1D> 350 inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 351 CeedScalar r_B[Q_1D]; 352 for (CeedInt i = 0; i < Q_1D; i++) { 353 r_B[i] = B[data.t_id_y + i*P_1D]; 354 } 355 356 for (CeedInt k = 0; k < P_1D; k++) { 357 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 358 __syncthreads(); 359 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 360 for (CeedInt i = 0; i < Q_1D; i++) { 361 V[k] += r_B[i] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction 362 } 363 } 364 __syncthreads(); 365 } 366 } 367 368 //------------------------------------------------------------------------------ 369 // 3D transpose tensor contract x 370 //------------------------------------------------------------------------------ 371 template <int NUM_COMP, int P_1D, int Q_1D> 372 inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 373 CeedScalar r_B[Q_1D]; 374 for (CeedInt i = 0; i < Q_1D; i++) { 375 r_B[i] = B[data.t_id_x + i*P_1D]; 376 } 377 378 for (CeedInt k = 0; k < P_1D; k++) { 379 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 380 __syncthreads(); 381 V[k] = 0.0; 382 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 383 for (CeedInt i = 0; i < Q_1D; i++) { 384 V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 385 } 386 } 387 __syncthreads(); 388 } 389 } 390 391 //------------------------------------------------------------------------------ 392 // 3D transpose tensor contract add x 393 //------------------------------------------------------------------------------ 394 template <int NUM_COMP, int P_1D, int Q_1D> 395 inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 396 CeedScalar r_B[Q_1D]; 397 for (CeedInt i = 0; i < Q_1D; i++) { 398 r_B[i] = B[data.t_id_x + i*P_1D]; 399 } 400 401 for (CeedInt k = 0; k < P_1D; k++) { 402 data.slice[data.t_id_x+data.t_id_y*T_1D] = U[k]; 403 __syncthreads(); 404 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 405 for (CeedInt i = 0; i < Q_1D; i++) { 406 V[k] += r_B[i] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction 407 } 408 } 409 __syncthreads(); 410 } 411 } 412 413 //------------------------------------------------------------------------------ 414 // 3D interpolate to quadrature points 415 //------------------------------------------------------------------------------ 416 template <int NUM_COMP, int P_1D, int Q_1D> 417 inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 418 CeedScalar r_t1[T_1D]; 419 CeedScalar r_t2[T_1D]; 420 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 421 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 422 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 423 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D); 424 } 425 } 426 427 //------------------------------------------------------------------------------ 428 // 3D interpolate transpose 429 //------------------------------------------------------------------------------ 430 template <int NUM_COMP, int P_1D, int Q_1D> 431 inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 432 CeedScalar r_t1[T_1D]; 433 CeedScalar r_t2[T_1D]; 434 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 435 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D, c_B, r_t1); 436 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 437 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 438 } 439 } 440 441 //------------------------------------------------------------------------------ 442 // 3D derivatives at quadrature points 443 //------------------------------------------------------------------------------ 444 template <int NUM_COMP, int P_1D, int Q_1D> 445 inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 446 CeedScalar r_t1[T_1D]; 447 CeedScalar r_t2[T_1D]; 448 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 449 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_G, r_t1); 450 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 451 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); 452 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 453 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 454 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); 455 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 456 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 457 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); 458 } 459 } 460 461 //------------------------------------------------------------------------------ 462 // 3D derivatives transpose 463 //------------------------------------------------------------------------------ 464 template <int NUM_COMP, int P_1D, int Q_1D> 465 inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 466 CeedScalar r_t1[T_1D]; 467 CeedScalar r_t2[T_1D]; 468 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 469 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_B, r_t1); 470 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 471 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp*P_1D); 472 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_B, r_t1); 473 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 474 ContractTransposeAddX3d<NUM_COMP,P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 475 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t1); 476 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 477 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 478 } 479 } 480 481 //------------------------------------------------------------------------------ 482 // 3D derivatives at quadrature points 483 //------------------------------------------------------------------------------ 484 template <int NUM_COMP, int P_1D, int Q_1D> 485 inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 486 CeedScalar r_t1[T_1D]; 487 CeedScalar r_t2[T_1D]; 488 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 489 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp*P_1D, c_B, r_t1); 490 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 491 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 492 ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 0*NUM_COMP*Q_1D); 493 ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 1*NUM_COMP*Q_1D); 494 ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp*Q_1D + 2*NUM_COMP*Q_1D); 495 } 496 } 497 498 //------------------------------------------------------------------------------ 499 // 3D derivatives transpose 500 //------------------------------------------------------------------------------ 501 template <int NUM_COMP, int P_1D, int Q_1D> 502 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 503 CeedScalar r_t1[T_1D]; 504 CeedScalar r_t2[T_1D]; 505 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 506 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 2*NUM_COMP*Q_1D, c_G, r_t2); 507 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 1*NUM_COMP*Q_1D, c_G, r_t2); 508 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp*Q_1D + 0*NUM_COMP*Q_1D, c_G, r_t2); 509 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 510 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 511 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp*P_1D); 512 } 513 } 514 515 //------------------------------------------------------------------------------ 516 // 3D quadrature weights 517 //------------------------------------------------------------------------------ 518 template <int Q_1D> 519 inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 520 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 521 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x]*q_weight_1d[data.t_id_y] : 0.0; 522 for (CeedInt q = 0; q < Q_1D; q++) { 523 w[q] = quad ? pw*q_weight_1d[q] : 0.0; 524 } 525 } 526 527 //------------------------------------------------------------------------------ 528 529 #endif 530