1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for SYCL shared memory tensor product basis templates 10 #ifndef _ceed_sycl_shared_basis_tensor_templates_h 11 #define _ceed_sycl_shared_basis_tensor_templates_h 12 13 #include <ceed.h> 14 15 //------------------------------------------------------------------------------ 16 // 1D 17 //------------------------------------------------------------------------------ 18 19 //------------------------------------------------------------------------------ 20 // 1D tensor contraction x 21 //------------------------------------------------------------------------------ 22 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 23 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 24 const CeedInt item_id_x = get_local_id(0); 25 26 scratch[item_id_x] = *U; 27 work_group_barrier(CLK_LOCAL_MEM_FENCE); 28 29 *V = 0.0; 30 if (item_id_x < Q_1D) { 31 for (CeedInt i = 0; i < P_1D; i++) { 32 *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 33 } 34 } 35 work_group_barrier(CLK_LOCAL_MEM_FENCE); 36 } 37 38 //------------------------------------------------------------------------------ 39 // 1D transpose tensor contraction x 40 //------------------------------------------------------------------------------ 41 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 42 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 43 const CeedInt item_id_x = get_local_id(0); 44 45 scratch[item_id_x] = *U; 46 work_group_barrier(CLK_LOCAL_MEM_FENCE); 47 48 *V = 0.0; 49 if (item_id_x < P_1D) { 50 for (CeedInt i = 0; i < Q_1D; i++) { 51 *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 52 } 53 } 54 work_group_barrier(CLK_LOCAL_MEM_FENCE); 55 } 56 57 //------------------------------------------------------------------------------ 58 // 1D interpolate to quadrature points 59 //------------------------------------------------------------------------------ 60 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 61 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 62 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 63 ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 64 } 65 } 66 67 //------------------------------------------------------------------------------ 68 // 1D interpolate transpose 69 //------------------------------------------------------------------------------ 70 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 71 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 72 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 73 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 74 } 75 } 76 77 //------------------------------------------------------------------------------ 78 // 1D derivatives at quadrature points 79 //------------------------------------------------------------------------------ 80 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 81 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 82 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 83 ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 84 } 85 } 86 87 //------------------------------------------------------------------------------ 88 // 1D derivatives transpose 89 //------------------------------------------------------------------------------ 90 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 91 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 92 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 93 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 94 } 95 } 96 97 //------------------------------------------------------------------------------ 98 // 1D quadrature weights 99 //------------------------------------------------------------------------------ 100 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 101 const CeedInt item_id_x = get_local_id(0); 102 *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 103 } 104 105 //------------------------------------------------------------------------------ 106 // 2D 107 //------------------------------------------------------------------------------ 108 109 //------------------------------------------------------------------------------ 110 // 2D tensor contraction x 111 //------------------------------------------------------------------------------ 112 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 113 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 114 const CeedInt item_id_x = get_local_id(0); 115 const CeedInt item_id_y = get_local_id(1); 116 117 scratch[item_id_x + item_id_y * T_1D] = *U; 118 work_group_barrier(CLK_LOCAL_MEM_FENCE); 119 120 *V = 0.0; 121 if (item_id_x < Q_1D && item_id_y < P_1D) { 122 for (CeedInt i = 0; i < P_1D; i++) { 123 *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 124 } 125 } 126 work_group_barrier(CLK_LOCAL_MEM_FENCE); 127 } 128 129 //------------------------------------------------------------------------------ 130 // 2D tensor contract y 131 //------------------------------------------------------------------------------ 132 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 133 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 134 const CeedInt item_id_x = get_local_id(0); 135 const CeedInt item_id_y = get_local_id(1); 136 137 scratch[item_id_x + item_id_y * T_1D] = *U; 138 work_group_barrier(CLK_LOCAL_MEM_FENCE); 139 140 *V = 0.0; 141 if (item_id_x < Q_1D && item_id_y < Q_1D) { 142 for (CeedInt i = 0; i < P_1D; i++) { 143 *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 144 } 145 } 146 work_group_barrier(CLK_LOCAL_MEM_FENCE); 147 } 148 149 //------------------------------------------------------------------------------ 150 // 2D transpose tensor contract y 151 //------------------------------------------------------------------------------ 152 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 153 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 154 const CeedInt item_id_x = get_local_id(0); 155 const CeedInt item_id_y = get_local_id(1); 156 157 scratch[item_id_x + item_id_y * T_1D] = *U; 158 work_group_barrier(CLK_LOCAL_MEM_FENCE); 159 160 *V = 0.0; 161 if (item_id_x < Q_1D && item_id_y < P_1D) { 162 for (CeedInt i = 0; i < Q_1D; i++) { 163 *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 164 } 165 } 166 work_group_barrier(CLK_LOCAL_MEM_FENCE); 167 } 168 169 //------------------------------------------------------------------------------ 170 // 2D transpose tensor contract x 171 //------------------------------------------------------------------------------ 172 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 173 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 174 const CeedInt item_id_x = get_local_id(0); 175 const CeedInt item_id_y = get_local_id(1); 176 177 scratch[item_id_x + item_id_y * T_1D] = *U; 178 work_group_barrier(CLK_LOCAL_MEM_FENCE); 179 180 *V = 0.0; 181 if (item_id_x < P_1D && item_id_y < P_1D) { 182 for (CeedInt i = 0; i < Q_1D; i++) { 183 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 184 } 185 } 186 work_group_barrier(CLK_LOCAL_MEM_FENCE); 187 } 188 189 //------------------------------------------------------------------------------ 190 // 2D transpose tensor contract and add x 191 //------------------------------------------------------------------------------ 192 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 193 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 194 const CeedInt item_id_x = get_local_id(0); 195 const CeedInt item_id_y = get_local_id(1); 196 197 scratch[item_id_x + item_id_y * T_1D] = *U; 198 work_group_barrier(CLK_LOCAL_MEM_FENCE); 199 200 if (item_id_x < P_1D && item_id_y < P_1D) { 201 for (CeedInt i = 0; i < Q_1D; i++) { 202 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 203 } 204 } 205 work_group_barrier(CLK_LOCAL_MEM_FENCE); 206 } 207 208 //------------------------------------------------------------------------------ 209 // 2D interpolate to quadrature points 210 //------------------------------------------------------------------------------ 211 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 212 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 213 CeedScalar r_t[1]; 214 215 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 216 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 217 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 218 } 219 } 220 221 //------------------------------------------------------------------------------ 222 // 2D interpolate transpose 223 //------------------------------------------------------------------------------ 224 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 225 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 226 CeedScalar r_t[1]; 227 228 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 229 ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 230 ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 231 } 232 } 233 234 //------------------------------------------------------------------------------ 235 // 2D derivatives at quadrature points 236 //------------------------------------------------------------------------------ 237 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 238 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 239 local CeedScalar *restrict scratch) { 240 CeedScalar r_t[1]; 241 242 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 243 ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 244 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 245 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 246 ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 247 } 248 } 249 250 //------------------------------------------------------------------------------ 251 // 2D derivatives transpose 252 //------------------------------------------------------------------------------ 253 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 254 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 255 local CeedScalar *restrict scratch) { 256 CeedScalar r_t[1]; 257 258 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 259 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 260 ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 261 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 262 ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 263 } 264 } 265 266 //------------------------------------------------------------------------------ 267 // 2D quadrature weights 268 //------------------------------------------------------------------------------ 269 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 270 const CeedInt item_id_x = get_local_id(0); 271 const CeedInt item_id_y = get_local_id(1); 272 273 *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 274 } 275 276 //------------------------------------------------------------------------------ 277 // 3D 278 //------------------------------------------------------------------------------ 279 280 //------------------------------------------------------------------------------ 281 // 3D tensor contract x 282 //------------------------------------------------------------------------------ 283 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 284 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 285 const CeedInt item_id_x = get_local_id(0); 286 const CeedInt item_id_y = get_local_id(1); 287 288 CeedScalar r_B[T_1D]; 289 for (CeedInt i = 0; i < P_1D; i++) { 290 r_B[i] = B[i + item_id_x * P_1D]; 291 } 292 293 for (CeedInt k = 0; k < P_1D; k++) { 294 scratch[item_id_x + item_id_y * T_1D] = U[k]; 295 work_group_barrier(CLK_LOCAL_MEM_FENCE); 296 297 V[k] = 0.0; 298 if (item_id_x < Q_1D && item_id_y < P_1D) { 299 for (CeedInt i = 0; i < P_1D; i++) { 300 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 301 } 302 } 303 work_group_barrier(CLK_LOCAL_MEM_FENCE); 304 } 305 } 306 307 //------------------------------------------------------------------------------ 308 // 3D tensor contract y 309 //------------------------------------------------------------------------------ 310 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 311 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 312 const CeedInt item_id_x = get_local_id(0); 313 const CeedInt item_id_y = get_local_id(1); 314 315 CeedScalar r_B[T_1D]; 316 for (CeedInt i = 0; i < P_1D; i++) { 317 r_B[i] = B[i + item_id_y * P_1D]; 318 } 319 320 for (CeedInt k = 0; k < P_1D; k++) { 321 scratch[item_id_x + item_id_y * T_1D] = U[k]; 322 work_group_barrier(CLK_LOCAL_MEM_FENCE); 323 324 V[k] = 0.0; 325 if (item_id_x < Q_1D && item_id_y < Q_1D) { 326 for (CeedInt i = 0; i < P_1D; i++) { 327 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 328 } 329 } 330 work_group_barrier(CLK_LOCAL_MEM_FENCE); 331 } 332 } 333 334 //------------------------------------------------------------------------------ 335 // 3D tensor contract z 336 //------------------------------------------------------------------------------ 337 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 338 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 339 const CeedInt item_id_x = get_local_id(0); 340 const CeedInt item_id_y = get_local_id(1); 341 342 for (CeedInt k = 0; k < Q_1D; k++) { 343 V[k] = 0.0; 344 if (item_id_x < Q_1D && item_id_y < Q_1D) { 345 for (CeedInt i = 0; i < P_1D; i++) { 346 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 347 } 348 } 349 } 350 } 351 352 //------------------------------------------------------------------------------ 353 // 3D transpose tensor contract z 354 //------------------------------------------------------------------------------ 355 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 356 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 357 const CeedInt item_id_x = get_local_id(0); 358 const CeedInt item_id_y = get_local_id(1); 359 360 for (CeedInt k = 0; k < P_1D; k++) { 361 V[k] = 0.0; 362 if (item_id_x < Q_1D && item_id_y < Q_1D) { 363 for (CeedInt i = 0; i < Q_1D; i++) { 364 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 365 } 366 } 367 } 368 } 369 370 //------------------------------------------------------------------------------ 371 // 3D transpose tensor contract y 372 //------------------------------------------------------------------------------ 373 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 374 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 375 const CeedInt item_id_x = get_local_id(0); 376 const CeedInt item_id_y = get_local_id(1); 377 378 CeedScalar r_B[T_1D]; 379 for (CeedInt i = 0; i < Q_1D; i++) { 380 r_B[i] = B[item_id_y + i * P_1D]; 381 } 382 383 for (CeedInt k = 0; k < P_1D; k++) { 384 scratch[item_id_x + item_id_y * T_1D] = U[k]; 385 work_group_barrier(CLK_LOCAL_MEM_FENCE); 386 387 V[k] = 0.0; 388 if (item_id_x < Q_1D && item_id_y < P_1D) { 389 for (CeedInt i = 0; i < Q_1D; i++) { 390 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 391 } 392 } 393 work_group_barrier(CLK_LOCAL_MEM_FENCE); 394 } 395 } 396 397 //------------------------------------------------------------------------------ 398 // 3D transpose tensor contract y 399 //------------------------------------------------------------------------------ 400 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 401 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 402 const CeedInt item_id_x = get_local_id(0); 403 const CeedInt item_id_y = get_local_id(1); 404 405 CeedScalar r_B[T_1D]; 406 for (CeedInt i = 0; i < Q_1D; i++) { 407 r_B[i] = B[item_id_y + i * P_1D]; 408 } 409 410 for (CeedInt k = 0; k < P_1D; k++) { 411 scratch[item_id_x + item_id_y * T_1D] = U[k]; 412 work_group_barrier(CLK_LOCAL_MEM_FENCE); 413 if (item_id_x < Q_1D && item_id_y < P_1D) { 414 for (CeedInt i = 0; i < Q_1D; i++) { 415 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 416 } 417 } 418 work_group_barrier(CLK_LOCAL_MEM_FENCE); 419 } 420 } 421 422 //------------------------------------------------------------------------------ 423 // 3D transpose tensor contract x 424 //------------------------------------------------------------------------------ 425 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 426 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 427 const CeedInt item_id_x = get_local_id(0); 428 const CeedInt item_id_y = get_local_id(1); 429 430 CeedScalar r_B[T_1D]; 431 for (CeedInt i = 0; i < Q_1D; i++) { 432 r_B[i] = B[item_id_x + i * P_1D]; 433 } 434 435 for (CeedInt k = 0; k < P_1D; k++) { 436 scratch[item_id_x + item_id_y * T_1D] = U[k]; 437 work_group_barrier(CLK_LOCAL_MEM_FENCE); 438 V[k] = 0.0; 439 if (item_id_x < P_1D && item_id_y < P_1D) { 440 for (CeedInt i = 0; i < Q_1D; i++) { 441 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 442 } 443 } 444 work_group_barrier(CLK_LOCAL_MEM_FENCE); 445 } 446 } 447 448 //------------------------------------------------------------------------------ 449 // 3D transpose tensor contract add x 450 //------------------------------------------------------------------------------ 451 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 452 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 453 const CeedInt item_id_x = get_local_id(0); 454 const CeedInt item_id_y = get_local_id(1); 455 456 CeedScalar r_B[T_1D]; 457 for (CeedInt i = 0; i < Q_1D; i++) { 458 r_B[i] = B[item_id_x + i * P_1D]; 459 } 460 461 for (CeedInt k = 0; k < P_1D; k++) { 462 scratch[item_id_x + item_id_y * T_1D] = U[k]; 463 work_group_barrier(CLK_LOCAL_MEM_FENCE); 464 465 if (item_id_x < P_1D && item_id_y < P_1D) { 466 for (CeedInt i = 0; i < Q_1D; i++) { 467 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 468 } 469 } 470 work_group_barrier(CLK_LOCAL_MEM_FENCE); 471 } 472 } 473 474 //------------------------------------------------------------------------------ 475 // 3D interpolate to quadrature points 476 //------------------------------------------------------------------------------ 477 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 478 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 479 CeedScalar r_t1[T_1D]; 480 CeedScalar r_t2[T_1D]; 481 482 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 483 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 484 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 485 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 486 } 487 } 488 489 //------------------------------------------------------------------------------ 490 // 3D interpolate transpose 491 //------------------------------------------------------------------------------ 492 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 493 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 494 CeedScalar r_t1[T_1D]; 495 CeedScalar r_t2[T_1D]; 496 497 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 498 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 499 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 500 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 501 } 502 } 503 504 //------------------------------------------------------------------------------ 505 // 3D derivatives at quadrature points 506 //------------------------------------------------------------------------------ 507 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 508 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 509 local CeedScalar *restrict scratch) { 510 CeedScalar r_t1[T_1D]; 511 CeedScalar r_t2[T_1D]; 512 513 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 514 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 515 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 516 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 517 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 518 ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 519 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 520 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 521 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 522 ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 523 } 524 } 525 526 //------------------------------------------------------------------------------ 527 // 3D derivatives transpose 528 //------------------------------------------------------------------------------ 529 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 530 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 531 local CeedScalar *restrict scratch) { 532 CeedScalar r_t1[T_1D]; 533 CeedScalar r_t2[T_1D]; 534 535 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 536 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 537 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 538 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 539 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 540 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 541 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 542 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 543 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 544 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 545 } 546 } 547 548 //------------------------------------------------------------------------------ 549 // 3D derivatives at quadrature points 550 //------------------------------------------------------------------------------ 551 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 552 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 553 local CeedScalar *restrict scratch) { 554 CeedScalar r_t1[T_1D]; 555 CeedScalar r_t2[T_1D]; 556 557 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 558 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 559 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 560 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 561 ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 562 ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 563 ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 564 } 565 } 566 567 //------------------------------------------------------------------------------ 568 // 3D derivatives transpose 569 //------------------------------------------------------------------------------ 570 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 571 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 572 private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 573 CeedScalar r_t1[T_1D]; 574 CeedScalar r_t2[T_1D]; 575 576 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 577 ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 578 ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 579 ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 580 ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 581 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 582 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 583 } 584 } 585 586 //------------------------------------------------------------------------------ 587 // 3D quadrature weights 588 //------------------------------------------------------------------------------ 589 // template <int Q_1D> 590 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 591 const CeedInt item_id_x = get_local_id(0); 592 const CeedInt item_id_y = get_local_id(1); 593 594 if (item_id_x < Q_1D && item_id_y < Q_1D) { 595 const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 596 for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 597 } else { 598 for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 599 } 600 } 601 602 //------------------------------------------------------------------------------ 603 604 #endif 605