1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for SYCL shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 20 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 21 const CeedInt item_id_x = get_local_id(0); 22 23 scratch[item_id_x] = *U; 24 work_group_barrier(CLK_LOCAL_MEM_FENCE); 25 26 *V = 0.0; 27 if (item_id_x < Q_1D) { 28 for (CeedInt i = 0; i < P_1D; i++) { 29 *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 30 } 31 } 32 work_group_barrier(CLK_LOCAL_MEM_FENCE); 33 } 34 35 //------------------------------------------------------------------------------ 36 // 1D transpose tensor contraction x 37 //------------------------------------------------------------------------------ 38 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 39 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 40 const CeedInt item_id_x = get_local_id(0); 41 42 scratch[item_id_x] = *U; 43 work_group_barrier(CLK_LOCAL_MEM_FENCE); 44 45 *V = 0.0; 46 if (item_id_x < P_1D) { 47 for (CeedInt i = 0; i < Q_1D; i++) { 48 *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 49 } 50 } 51 work_group_barrier(CLK_LOCAL_MEM_FENCE); 52 } 53 54 //------------------------------------------------------------------------------ 55 // 1D interpolate to quadrature points 56 //------------------------------------------------------------------------------ 57 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 58 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 59 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 60 ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 61 } 62 } 63 64 //------------------------------------------------------------------------------ 65 // 1D interpolate transpose 66 //------------------------------------------------------------------------------ 67 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 68 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 69 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 70 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 71 } 72 } 73 74 //------------------------------------------------------------------------------ 75 // 1D derivatives at quadrature points 76 //------------------------------------------------------------------------------ 77 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 78 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 79 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 80 ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 81 } 82 } 83 84 //------------------------------------------------------------------------------ 85 // 1D derivatives transpose 86 //------------------------------------------------------------------------------ 87 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 88 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 89 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 90 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 91 } 92 } 93 94 //------------------------------------------------------------------------------ 95 // 1D quadrature weights 96 //------------------------------------------------------------------------------ 97 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 98 const CeedInt item_id_x = get_local_id(0); 99 *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 100 } 101 102 //------------------------------------------------------------------------------ 103 // 2D 104 //------------------------------------------------------------------------------ 105 106 //------------------------------------------------------------------------------ 107 // 2D tensor contraction x 108 //------------------------------------------------------------------------------ 109 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 110 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 111 const CeedInt item_id_x = get_local_id(0); 112 const CeedInt item_id_y = get_local_id(1); 113 114 scratch[item_id_x + item_id_y * T_1D] = *U; 115 work_group_barrier(CLK_LOCAL_MEM_FENCE); 116 117 *V = 0.0; 118 if (item_id_x < Q_1D && item_id_y < P_1D) { 119 for (CeedInt i = 0; i < P_1D; i++) { 120 *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 121 } 122 } 123 work_group_barrier(CLK_LOCAL_MEM_FENCE); 124 } 125 126 //------------------------------------------------------------------------------ 127 // 2D tensor contract y 128 //------------------------------------------------------------------------------ 129 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 130 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 131 const CeedInt item_id_x = get_local_id(0); 132 const CeedInt item_id_y = get_local_id(1); 133 134 scratch[item_id_x + item_id_y * T_1D] = *U; 135 work_group_barrier(CLK_LOCAL_MEM_FENCE); 136 137 *V = 0.0; 138 if (item_id_x < Q_1D && item_id_y < Q_1D) { 139 for (CeedInt i = 0; i < P_1D; i++) { 140 *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 141 } 142 } 143 work_group_barrier(CLK_LOCAL_MEM_FENCE); 144 } 145 146 //------------------------------------------------------------------------------ 147 // 2D transpose tensor contract y 148 //------------------------------------------------------------------------------ 149 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 150 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 151 const CeedInt item_id_x = get_local_id(0); 152 const CeedInt item_id_y = get_local_id(1); 153 154 scratch[item_id_x + item_id_y * T_1D] = *U; 155 work_group_barrier(CLK_LOCAL_MEM_FENCE); 156 157 *V = 0.0; 158 if (item_id_x < Q_1D && item_id_y < P_1D) { 159 for (CeedInt i = 0; i < Q_1D; i++) { 160 *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 161 } 162 } 163 work_group_barrier(CLK_LOCAL_MEM_FENCE); 164 } 165 166 //------------------------------------------------------------------------------ 167 // 2D transpose tensor contract x 168 //------------------------------------------------------------------------------ 169 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 170 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 171 const CeedInt item_id_x = get_local_id(0); 172 const CeedInt item_id_y = get_local_id(1); 173 174 scratch[item_id_x + item_id_y * T_1D] = *U; 175 work_group_barrier(CLK_LOCAL_MEM_FENCE); 176 177 *V = 0.0; 178 if (item_id_x < P_1D && item_id_y < P_1D) { 179 for (CeedInt i = 0; i < Q_1D; i++) { 180 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 181 } 182 } 183 work_group_barrier(CLK_LOCAL_MEM_FENCE); 184 } 185 186 //------------------------------------------------------------------------------ 187 // 2D transpose tensor contract and add x 188 //------------------------------------------------------------------------------ 189 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 190 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 191 const CeedInt item_id_x = get_local_id(0); 192 const CeedInt item_id_y = get_local_id(1); 193 194 scratch[item_id_x + item_id_y * T_1D] = *U; 195 work_group_barrier(CLK_LOCAL_MEM_FENCE); 196 197 if (item_id_x < P_1D && item_id_y < P_1D) { 198 for (CeedInt i = 0; i < Q_1D; i++) { 199 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 200 } 201 } 202 work_group_barrier(CLK_LOCAL_MEM_FENCE); 203 } 204 205 //------------------------------------------------------------------------------ 206 // 2D interpolate to quadrature points 207 //------------------------------------------------------------------------------ 208 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 209 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 210 CeedScalar r_t[1]; 211 212 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 213 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 214 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 215 } 216 } 217 218 //------------------------------------------------------------------------------ 219 // 2D interpolate transpose 220 //------------------------------------------------------------------------------ 221 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 222 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 223 CeedScalar r_t[1]; 224 225 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 226 ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 227 ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 228 } 229 } 230 231 //------------------------------------------------------------------------------ 232 // 2D derivatives at quadrature points 233 //------------------------------------------------------------------------------ 234 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 235 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 236 local CeedScalar *restrict scratch) { 237 CeedScalar r_t[1]; 238 239 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 240 ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 241 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 242 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 243 ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 244 } 245 } 246 247 //------------------------------------------------------------------------------ 248 // 2D derivatives transpose 249 //------------------------------------------------------------------------------ 250 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 251 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 252 local CeedScalar *restrict scratch) { 253 CeedScalar r_t[1]; 254 255 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 256 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 257 ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 258 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 259 ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 260 } 261 } 262 263 //------------------------------------------------------------------------------ 264 // 2D quadrature weights 265 //------------------------------------------------------------------------------ 266 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 267 const CeedInt item_id_x = get_local_id(0); 268 const CeedInt item_id_y = get_local_id(1); 269 270 *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 271 } 272 273 //------------------------------------------------------------------------------ 274 // 3D 275 //------------------------------------------------------------------------------ 276 277 //------------------------------------------------------------------------------ 278 // 3D tensor contract x 279 //------------------------------------------------------------------------------ 280 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 281 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 282 const CeedInt item_id_x = get_local_id(0); 283 const CeedInt item_id_y = get_local_id(1); 284 285 CeedScalar r_B[T_1D]; 286 for (CeedInt i = 0; i < P_1D; i++) { 287 r_B[i] = B[i + item_id_x * P_1D]; 288 } 289 290 for (CeedInt k = 0; k < P_1D; k++) { 291 scratch[item_id_x + item_id_y * T_1D] = U[k]; 292 work_group_barrier(CLK_LOCAL_MEM_FENCE); 293 294 V[k] = 0.0; 295 if (item_id_x < Q_1D && item_id_y < P_1D) { 296 for (CeedInt i = 0; i < P_1D; i++) { 297 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 298 } 299 } 300 work_group_barrier(CLK_LOCAL_MEM_FENCE); 301 } 302 } 303 304 //------------------------------------------------------------------------------ 305 // 3D tensor contract y 306 //------------------------------------------------------------------------------ 307 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 308 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 309 const CeedInt item_id_x = get_local_id(0); 310 const CeedInt item_id_y = get_local_id(1); 311 312 CeedScalar r_B[T_1D]; 313 for (CeedInt i = 0; i < P_1D; i++) { 314 r_B[i] = B[i + item_id_y * P_1D]; 315 } 316 317 for (CeedInt k = 0; k < P_1D; k++) { 318 scratch[item_id_x + item_id_y * T_1D] = U[k]; 319 work_group_barrier(CLK_LOCAL_MEM_FENCE); 320 321 V[k] = 0.0; 322 if (item_id_x < Q_1D && item_id_y < Q_1D) { 323 for (CeedInt i = 0; i < P_1D; i++) { 324 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 325 } 326 } 327 work_group_barrier(CLK_LOCAL_MEM_FENCE); 328 } 329 } 330 331 //------------------------------------------------------------------------------ 332 // 3D tensor contract z 333 //------------------------------------------------------------------------------ 334 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 335 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 336 const CeedInt item_id_x = get_local_id(0); 337 const CeedInt item_id_y = get_local_id(1); 338 339 for (CeedInt k = 0; k < Q_1D; k++) { 340 V[k] = 0.0; 341 if (item_id_x < Q_1D && item_id_y < Q_1D) { 342 for (CeedInt i = 0; i < P_1D; i++) { 343 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 344 } 345 } 346 } 347 } 348 349 //------------------------------------------------------------------------------ 350 // 3D transpose tensor contract z 351 //------------------------------------------------------------------------------ 352 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 353 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 354 const CeedInt item_id_x = get_local_id(0); 355 const CeedInt item_id_y = get_local_id(1); 356 357 for (CeedInt k = 0; k < P_1D; k++) { 358 V[k] = 0.0; 359 if (item_id_x < Q_1D && item_id_y < Q_1D) { 360 for (CeedInt i = 0; i < Q_1D; i++) { 361 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 362 } 363 } 364 } 365 } 366 367 //------------------------------------------------------------------------------ 368 // 3D transpose tensor contract y 369 //------------------------------------------------------------------------------ 370 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 371 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 372 const CeedInt item_id_x = get_local_id(0); 373 const CeedInt item_id_y = get_local_id(1); 374 375 CeedScalar r_B[T_1D]; 376 for (CeedInt i = 0; i < Q_1D; i++) { 377 r_B[i] = B[item_id_y + i * P_1D]; 378 } 379 380 for (CeedInt k = 0; k < P_1D; k++) { 381 scratch[item_id_x + item_id_y * T_1D] = U[k]; 382 work_group_barrier(CLK_LOCAL_MEM_FENCE); 383 384 V[k] = 0.0; 385 if (item_id_x < Q_1D && item_id_y < P_1D) { 386 for (CeedInt i = 0; i < Q_1D; i++) { 387 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 388 } 389 } 390 work_group_barrier(CLK_LOCAL_MEM_FENCE); 391 } 392 } 393 394 //------------------------------------------------------------------------------ 395 // 3D transpose tensor contract y 396 //------------------------------------------------------------------------------ 397 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 398 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 399 const CeedInt item_id_x = get_local_id(0); 400 const CeedInt item_id_y = get_local_id(1); 401 402 CeedScalar r_B[T_1D]; 403 for (CeedInt i = 0; i < Q_1D; i++) { 404 r_B[i] = B[item_id_y + i * P_1D]; 405 } 406 407 for (CeedInt k = 0; k < P_1D; k++) { 408 scratch[item_id_x + item_id_y * T_1D] = U[k]; 409 work_group_barrier(CLK_LOCAL_MEM_FENCE); 410 if (item_id_x < Q_1D && item_id_y < P_1D) { 411 for (CeedInt i = 0; i < Q_1D; i++) { 412 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 413 } 414 } 415 work_group_barrier(CLK_LOCAL_MEM_FENCE); 416 } 417 } 418 419 //------------------------------------------------------------------------------ 420 // 3D transpose tensor contract x 421 //------------------------------------------------------------------------------ 422 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 423 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 424 const CeedInt item_id_x = get_local_id(0); 425 const CeedInt item_id_y = get_local_id(1); 426 427 CeedScalar r_B[T_1D]; 428 for (CeedInt i = 0; i < Q_1D; i++) { 429 r_B[i] = B[item_id_x + i * P_1D]; 430 } 431 432 for (CeedInt k = 0; k < P_1D; k++) { 433 scratch[item_id_x + item_id_y * T_1D] = U[k]; 434 work_group_barrier(CLK_LOCAL_MEM_FENCE); 435 V[k] = 0.0; 436 if (item_id_x < P_1D && item_id_y < P_1D) { 437 for (CeedInt i = 0; i < Q_1D; i++) { 438 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 439 } 440 } 441 work_group_barrier(CLK_LOCAL_MEM_FENCE); 442 } 443 } 444 445 //------------------------------------------------------------------------------ 446 // 3D transpose tensor contract add x 447 //------------------------------------------------------------------------------ 448 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 449 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 450 const CeedInt item_id_x = get_local_id(0); 451 const CeedInt item_id_y = get_local_id(1); 452 453 CeedScalar r_B[T_1D]; 454 for (CeedInt i = 0; i < Q_1D; i++) { 455 r_B[i] = B[item_id_x + i * P_1D]; 456 } 457 458 for (CeedInt k = 0; k < P_1D; k++) { 459 scratch[item_id_x + item_id_y * T_1D] = U[k]; 460 work_group_barrier(CLK_LOCAL_MEM_FENCE); 461 462 if (item_id_x < P_1D && item_id_y < P_1D) { 463 for (CeedInt i = 0; i < Q_1D; i++) { 464 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 465 } 466 } 467 work_group_barrier(CLK_LOCAL_MEM_FENCE); 468 } 469 } 470 471 //------------------------------------------------------------------------------ 472 // 3D interpolate to quadrature points 473 //------------------------------------------------------------------------------ 474 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 475 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 476 CeedScalar r_t1[T_1D]; 477 CeedScalar r_t2[T_1D]; 478 479 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 480 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 481 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 482 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 483 } 484 } 485 486 //------------------------------------------------------------------------------ 487 // 3D interpolate transpose 488 //------------------------------------------------------------------------------ 489 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 490 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 491 CeedScalar r_t1[T_1D]; 492 CeedScalar r_t2[T_1D]; 493 494 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 495 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 496 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 497 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 498 } 499 } 500 501 //------------------------------------------------------------------------------ 502 // 3D derivatives at quadrature points 503 //------------------------------------------------------------------------------ 504 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 505 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 506 local CeedScalar *restrict scratch) { 507 CeedScalar r_t1[T_1D]; 508 CeedScalar r_t2[T_1D]; 509 510 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 511 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 512 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 513 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 514 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 515 ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 516 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 517 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 518 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 519 ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 520 } 521 } 522 523 //------------------------------------------------------------------------------ 524 // 3D derivatives transpose 525 //------------------------------------------------------------------------------ 526 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 527 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 528 local CeedScalar *restrict scratch) { 529 CeedScalar r_t1[T_1D]; 530 CeedScalar r_t2[T_1D]; 531 532 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 533 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 534 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 535 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 536 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 537 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 538 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 539 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 540 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 541 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 542 } 543 } 544 545 //------------------------------------------------------------------------------ 546 // 3D derivatives at quadrature points 547 //------------------------------------------------------------------------------ 548 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 549 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 550 local CeedScalar *restrict scratch) { 551 CeedScalar r_t1[T_1D]; 552 CeedScalar r_t2[T_1D]; 553 554 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 555 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 556 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 557 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 558 ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 559 ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 560 ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 561 } 562 } 563 564 //------------------------------------------------------------------------------ 565 // 3D derivatives transpose 566 //------------------------------------------------------------------------------ 567 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 568 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 569 private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 570 CeedScalar r_t1[T_1D]; 571 CeedScalar r_t2[T_1D]; 572 573 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 574 ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 575 ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 576 ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 577 ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 578 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 579 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 580 } 581 } 582 583 //------------------------------------------------------------------------------ 584 // 3D quadrature weights 585 //------------------------------------------------------------------------------ 586 // template <int Q_1D> 587 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 588 const CeedInt item_id_x = get_local_id(0); 589 const CeedInt item_id_y = get_local_id(1); 590 591 if (item_id_x < Q_1D && item_id_y < Q_1D) { 592 const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 593 for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 594 } else { 595 for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 596 } 597 } 598