1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for SYCL shared memory tensor product basis templates 10 11 #include <ceed/types.h> 12 13 //------------------------------------------------------------------------------ 14 // 1D 15 //------------------------------------------------------------------------------ 16 17 //------------------------------------------------------------------------------ 18 // 1D tensor contraction x 19 //------------------------------------------------------------------------------ 20 inline void ContractX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 21 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 22 const CeedInt item_id_x = get_local_id(0); 23 24 scratch[item_id_x] = *U; 25 work_group_barrier(CLK_LOCAL_MEM_FENCE); 26 27 *V = 0.0; 28 if (item_id_x < Q_1D) { 29 for (CeedInt i = 0; i < P_1D; i++) { 30 *V += B[i + item_id_x * P_1D] * scratch[i]; // Contract x direction 31 } 32 } 33 work_group_barrier(CLK_LOCAL_MEM_FENCE); 34 } 35 36 //------------------------------------------------------------------------------ 37 // 1D transpose tensor contraction x 38 //------------------------------------------------------------------------------ 39 inline void ContractTransposeX1d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 40 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 41 const CeedInt item_id_x = get_local_id(0); 42 43 scratch[item_id_x] = *U; 44 work_group_barrier(CLK_LOCAL_MEM_FENCE); 45 46 *V = 0.0; 47 if (item_id_x < P_1D) { 48 for (CeedInt i = 0; i < Q_1D; i++) { 49 *V += B[item_id_x + i * P_1D] * scratch[i]; // Contract x direction 50 } 51 } 52 work_group_barrier(CLK_LOCAL_MEM_FENCE); 53 } 54 55 //------------------------------------------------------------------------------ 56 // 1D interpolate to quadrature points 57 //------------------------------------------------------------------------------ 58 inline void Interp1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 59 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 60 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 61 ContractX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 62 } 63 } 64 65 //------------------------------------------------------------------------------ 66 // 1D interpolate transpose 67 //------------------------------------------------------------------------------ 68 inline void InterpTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 69 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 70 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 71 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_B, r_V + comp, scratch); 72 } 73 } 74 75 //------------------------------------------------------------------------------ 76 // 1D derivatives at quadrature points 77 //------------------------------------------------------------------------------ 78 inline void Grad1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 79 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 80 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 81 ContractX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 82 } 83 } 84 85 //------------------------------------------------------------------------------ 86 // 1D derivatives transpose 87 //------------------------------------------------------------------------------ 88 inline void GradTranspose1d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 89 local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 90 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 91 ContractTransposeX1d(P_1D, Q_1D, r_U + comp, s_G, r_V + comp, scratch); 92 } 93 } 94 95 //------------------------------------------------------------------------------ 96 // 1D quadrature weights 97 //------------------------------------------------------------------------------ 98 inline void Weight1d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 99 const CeedInt item_id_x = get_local_id(0); 100 *w = (item_id_x < Q_1D) ? q_weight_1d[item_id_x] : 0.0; 101 } 102 103 //------------------------------------------------------------------------------ 104 // 2D 105 //------------------------------------------------------------------------------ 106 107 //------------------------------------------------------------------------------ 108 // 2D tensor contraction x 109 //------------------------------------------------------------------------------ 110 inline void ContractX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 111 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 112 const CeedInt item_id_x = get_local_id(0); 113 const CeedInt item_id_y = get_local_id(1); 114 115 scratch[item_id_x + item_id_y * T_1D] = *U; 116 work_group_barrier(CLK_LOCAL_MEM_FENCE); 117 118 *V = 0.0; 119 if (item_id_x < Q_1D && item_id_y < P_1D) { 120 for (CeedInt i = 0; i < P_1D; i++) { 121 *V += B[i + item_id_x * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 122 } 123 } 124 work_group_barrier(CLK_LOCAL_MEM_FENCE); 125 } 126 127 //------------------------------------------------------------------------------ 128 // 2D tensor contract y 129 //------------------------------------------------------------------------------ 130 inline void ContractY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 131 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 132 const CeedInt item_id_x = get_local_id(0); 133 const CeedInt item_id_y = get_local_id(1); 134 135 scratch[item_id_x + item_id_y * T_1D] = *U; 136 work_group_barrier(CLK_LOCAL_MEM_FENCE); 137 138 *V = 0.0; 139 if (item_id_x < Q_1D && item_id_y < Q_1D) { 140 for (CeedInt i = 0; i < P_1D; i++) { 141 *V += B[i + item_id_y * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 142 } 143 } 144 work_group_barrier(CLK_LOCAL_MEM_FENCE); 145 } 146 147 //------------------------------------------------------------------------------ 148 // 2D transpose tensor contract y 149 //------------------------------------------------------------------------------ 150 inline void ContractTransposeY2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 151 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 152 const CeedInt item_id_x = get_local_id(0); 153 const CeedInt item_id_y = get_local_id(1); 154 155 scratch[item_id_x + item_id_y * T_1D] = *U; 156 work_group_barrier(CLK_LOCAL_MEM_FENCE); 157 158 *V = 0.0; 159 if (item_id_x < Q_1D && item_id_y < P_1D) { 160 for (CeedInt i = 0; i < Q_1D; i++) { 161 *V += B[item_id_y + i * P_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction 162 } 163 } 164 work_group_barrier(CLK_LOCAL_MEM_FENCE); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract x 169 //------------------------------------------------------------------------------ 170 inline void ContractTransposeX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 171 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 172 const CeedInt item_id_x = get_local_id(0); 173 const CeedInt item_id_y = get_local_id(1); 174 175 scratch[item_id_x + item_id_y * T_1D] = *U; 176 work_group_barrier(CLK_LOCAL_MEM_FENCE); 177 178 *V = 0.0; 179 if (item_id_x < P_1D && item_id_y < P_1D) { 180 for (CeedInt i = 0; i < Q_1D; i++) { 181 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 182 } 183 } 184 work_group_barrier(CLK_LOCAL_MEM_FENCE); 185 } 186 187 //------------------------------------------------------------------------------ 188 // 2D transpose tensor contract and add x 189 //------------------------------------------------------------------------------ 190 inline void ContractTransposeAddX2d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 191 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 192 const CeedInt item_id_x = get_local_id(0); 193 const CeedInt item_id_y = get_local_id(1); 194 195 scratch[item_id_x + item_id_y * T_1D] = *U; 196 work_group_barrier(CLK_LOCAL_MEM_FENCE); 197 198 if (item_id_x < P_1D && item_id_y < P_1D) { 199 for (CeedInt i = 0; i < Q_1D; i++) { 200 *V += B[item_id_x + i * P_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction 201 } 202 } 203 work_group_barrier(CLK_LOCAL_MEM_FENCE); 204 } 205 206 //------------------------------------------------------------------------------ 207 // 2D interpolate to quadrature points 208 //------------------------------------------------------------------------------ 209 inline void InterpTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 210 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 211 CeedScalar r_t[1]; 212 213 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 214 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 215 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 216 } 217 } 218 219 //------------------------------------------------------------------------------ 220 // 2D interpolate transpose 221 //------------------------------------------------------------------------------ 222 inline void InterpTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 223 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 224 CeedScalar r_t[1]; 225 226 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 227 ContractTransposeY2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 228 ContractTransposeX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 229 } 230 } 231 232 //------------------------------------------------------------------------------ 233 // 2D derivatives at quadrature points 234 //------------------------------------------------------------------------------ 235 inline void GradTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 236 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 237 local CeedScalar *restrict scratch) { 238 CeedScalar r_t[1]; 239 240 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 241 ContractX2d(P_1D, Q_1D, r_U + comp, s_G, r_t, scratch); 242 ContractY2d(P_1D, Q_1D, r_t, s_B, r_V + comp + 0 * NUM_COMP, scratch); 243 ContractX2d(P_1D, Q_1D, r_U + comp, s_B, r_t, scratch); 244 ContractY2d(P_1D, Q_1D, r_t, s_G, r_V + comp + 1 * NUM_COMP, scratch); 245 } 246 } 247 248 //------------------------------------------------------------------------------ 249 // 2D derivatives transpose 250 //------------------------------------------------------------------------------ 251 inline void GradTransposeTensor2d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 252 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 253 local CeedScalar *restrict scratch) { 254 CeedScalar r_t[1]; 255 256 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 257 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 0 * NUM_COMP, s_B, r_t, scratch); 258 ContractTransposeX2d(P_1D, Q_1D, r_t, s_G, r_V + comp, scratch); 259 ContractTransposeY2d(P_1D, Q_1D, r_U + comp + 1 * NUM_COMP, s_G, r_t, scratch); 260 ContractTransposeAddX2d(P_1D, Q_1D, r_t, s_B, r_V + comp, scratch); 261 } 262 } 263 264 //------------------------------------------------------------------------------ 265 // 2D quadrature weights 266 //------------------------------------------------------------------------------ 267 inline void WeightTensor2d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 268 const CeedInt item_id_x = get_local_id(0); 269 const CeedInt item_id_y = get_local_id(1); 270 271 *w = (item_id_x < Q_1D && item_id_y < Q_1D) ? q_weight_1d[item_id_x] * q_weight_1d[item_id_y] : 0.0; 272 } 273 274 //------------------------------------------------------------------------------ 275 // 3D 276 //------------------------------------------------------------------------------ 277 278 //------------------------------------------------------------------------------ 279 // 3D tensor contract x 280 //------------------------------------------------------------------------------ 281 inline void ContractX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 282 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 283 const CeedInt item_id_x = get_local_id(0); 284 const CeedInt item_id_y = get_local_id(1); 285 286 CeedScalar r_B[T_1D]; 287 for (CeedInt i = 0; i < P_1D; i++) { 288 r_B[i] = B[i + item_id_x * P_1D]; 289 } 290 291 for (CeedInt k = 0; k < P_1D; k++) { 292 scratch[item_id_x + item_id_y * T_1D] = U[k]; 293 work_group_barrier(CLK_LOCAL_MEM_FENCE); 294 295 V[k] = 0.0; 296 if (item_id_x < Q_1D && item_id_y < P_1D) { 297 for (CeedInt i = 0; i < P_1D; i++) { 298 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 299 } 300 } 301 work_group_barrier(CLK_LOCAL_MEM_FENCE); 302 } 303 } 304 305 //------------------------------------------------------------------------------ 306 // 3D tensor contract y 307 //------------------------------------------------------------------------------ 308 inline void ContractY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 309 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 310 const CeedInt item_id_x = get_local_id(0); 311 const CeedInt item_id_y = get_local_id(1); 312 313 CeedScalar r_B[T_1D]; 314 for (CeedInt i = 0; i < P_1D; i++) { 315 r_B[i] = B[i + item_id_y * P_1D]; 316 } 317 318 for (CeedInt k = 0; k < P_1D; k++) { 319 scratch[item_id_x + item_id_y * T_1D] = U[k]; 320 work_group_barrier(CLK_LOCAL_MEM_FENCE); 321 322 V[k] = 0.0; 323 if (item_id_x < Q_1D && item_id_y < Q_1D) { 324 for (CeedInt i = 0; i < P_1D; i++) { 325 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 326 } 327 } 328 work_group_barrier(CLK_LOCAL_MEM_FENCE); 329 } 330 } 331 332 //------------------------------------------------------------------------------ 333 // 3D tensor contract z 334 //------------------------------------------------------------------------------ 335 inline void ContractZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 336 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 337 const CeedInt item_id_x = get_local_id(0); 338 const CeedInt item_id_y = get_local_id(1); 339 340 for (CeedInt k = 0; k < Q_1D; k++) { 341 V[k] = 0.0; 342 if (item_id_x < Q_1D && item_id_y < Q_1D) { 343 for (CeedInt i = 0; i < P_1D; i++) { 344 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 345 } 346 } 347 } 348 } 349 350 //------------------------------------------------------------------------------ 351 // 3D transpose tensor contract z 352 //------------------------------------------------------------------------------ 353 inline void ContractTransposeZ3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 354 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 355 const CeedInt item_id_x = get_local_id(0); 356 const CeedInt item_id_y = get_local_id(1); 357 358 for (CeedInt k = 0; k < P_1D; k++) { 359 V[k] = 0.0; 360 if (item_id_x < Q_1D && item_id_y < Q_1D) { 361 for (CeedInt i = 0; i < Q_1D; i++) { 362 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 363 } 364 } 365 } 366 } 367 368 //------------------------------------------------------------------------------ 369 // 3D transpose tensor contract y 370 //------------------------------------------------------------------------------ 371 inline void ContractTransposeY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 372 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 373 const CeedInt item_id_x = get_local_id(0); 374 const CeedInt item_id_y = get_local_id(1); 375 376 CeedScalar r_B[T_1D]; 377 for (CeedInt i = 0; i < Q_1D; i++) { 378 r_B[i] = B[item_id_y + i * P_1D]; 379 } 380 381 for (CeedInt k = 0; k < P_1D; k++) { 382 scratch[item_id_x + item_id_y * T_1D] = U[k]; 383 work_group_barrier(CLK_LOCAL_MEM_FENCE); 384 385 V[k] = 0.0; 386 if (item_id_x < Q_1D && item_id_y < P_1D) { 387 for (CeedInt i = 0; i < Q_1D; i++) { 388 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 389 } 390 } 391 work_group_barrier(CLK_LOCAL_MEM_FENCE); 392 } 393 } 394 395 //------------------------------------------------------------------------------ 396 // 3D transpose tensor contract y 397 //------------------------------------------------------------------------------ 398 inline void ContractTransposeAddY3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 399 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 400 const CeedInt item_id_x = get_local_id(0); 401 const CeedInt item_id_y = get_local_id(1); 402 403 CeedScalar r_B[T_1D]; 404 for (CeedInt i = 0; i < Q_1D; i++) { 405 r_B[i] = B[item_id_y + i * P_1D]; 406 } 407 408 for (CeedInt k = 0; k < P_1D; k++) { 409 scratch[item_id_x + item_id_y * T_1D] = U[k]; 410 work_group_barrier(CLK_LOCAL_MEM_FENCE); 411 if (item_id_x < Q_1D && item_id_y < P_1D) { 412 for (CeedInt i = 0; i < Q_1D; i++) { 413 V[k] += r_B[i] * scratch[item_id_x + i * T_1D]; // Contract y direction 414 } 415 } 416 work_group_barrier(CLK_LOCAL_MEM_FENCE); 417 } 418 } 419 420 //------------------------------------------------------------------------------ 421 // 3D transpose tensor contract x 422 //------------------------------------------------------------------------------ 423 inline void ContractTransposeX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 424 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 425 const CeedInt item_id_x = get_local_id(0); 426 const CeedInt item_id_y = get_local_id(1); 427 428 CeedScalar r_B[T_1D]; 429 for (CeedInt i = 0; i < Q_1D; i++) { 430 r_B[i] = B[item_id_x + i * P_1D]; 431 } 432 433 for (CeedInt k = 0; k < P_1D; k++) { 434 scratch[item_id_x + item_id_y * T_1D] = U[k]; 435 work_group_barrier(CLK_LOCAL_MEM_FENCE); 436 V[k] = 0.0; 437 if (item_id_x < P_1D && item_id_y < P_1D) { 438 for (CeedInt i = 0; i < Q_1D; i++) { 439 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 440 } 441 } 442 work_group_barrier(CLK_LOCAL_MEM_FENCE); 443 } 444 } 445 446 //------------------------------------------------------------------------------ 447 // 3D transpose tensor contract add x 448 //------------------------------------------------------------------------------ 449 inline void ContractTransposeAddX3d(const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict U, local const CeedScalar *restrict B, 450 private CeedScalar *restrict V, local CeedScalar *restrict scratch) { 451 const CeedInt item_id_x = get_local_id(0); 452 const CeedInt item_id_y = get_local_id(1); 453 454 CeedScalar r_B[T_1D]; 455 for (CeedInt i = 0; i < Q_1D; i++) { 456 r_B[i] = B[item_id_x + i * P_1D]; 457 } 458 459 for (CeedInt k = 0; k < P_1D; k++) { 460 scratch[item_id_x + item_id_y * T_1D] = U[k]; 461 work_group_barrier(CLK_LOCAL_MEM_FENCE); 462 463 if (item_id_x < P_1D && item_id_y < P_1D) { 464 for (CeedInt i = 0; i < Q_1D; i++) { 465 V[k] += r_B[i] * scratch[i + item_id_y * T_1D]; // Contract x direction 466 } 467 } 468 work_group_barrier(CLK_LOCAL_MEM_FENCE); 469 } 470 } 471 472 //------------------------------------------------------------------------------ 473 // 3D interpolate to quadrature points 474 //------------------------------------------------------------------------------ 475 inline void InterpTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 476 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 477 CeedScalar r_t1[T_1D]; 478 CeedScalar r_t2[T_1D]; 479 480 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 481 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 482 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 483 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D, scratch); 484 } 485 } 486 487 //------------------------------------------------------------------------------ 488 // 3D interpolate transpose 489 //------------------------------------------------------------------------------ 490 inline void InterpTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 491 local const CeedScalar *restrict s_B, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 492 CeedScalar r_t1[T_1D]; 493 CeedScalar r_t2[T_1D]; 494 495 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 496 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D, s_B, r_t1, scratch); 497 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 498 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 499 } 500 } 501 502 //------------------------------------------------------------------------------ 503 // 3D derivatives at quadrature points 504 //------------------------------------------------------------------------------ 505 inline void GradTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 506 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 507 local CeedScalar *restrict scratch) { 508 CeedScalar r_t1[T_1D]; 509 CeedScalar r_t2[T_1D]; 510 511 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 512 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_G, r_t1, scratch); 513 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 514 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 515 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 516 ContractY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 517 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 518 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 519 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 520 ContractZ3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 521 } 522 } 523 524 //------------------------------------------------------------------------------ 525 // 3D derivatives transpose 526 //------------------------------------------------------------------------------ 527 inline void GradTransposeTensor3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 528 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 529 local CeedScalar *restrict scratch) { 530 CeedScalar r_t1[T_1D]; 531 CeedScalar r_t2[T_1D]; 532 533 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 534 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 535 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 536 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_G, r_V + comp * P_1D, scratch); 537 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_B, r_t1, scratch); 538 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_G, r_t2, scratch); 539 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 540 ContractTransposeZ3d(P_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t1, scratch); 541 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 542 ContractTransposeAddX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 543 } 544 } 545 546 //------------------------------------------------------------------------------ 547 // 3D derivatives at quadrature points 548 //------------------------------------------------------------------------------ 549 inline void GradTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 550 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, private CeedScalar *restrict r_V, 551 local CeedScalar *restrict scratch) { 552 CeedScalar r_t1[T_1D]; 553 CeedScalar r_t2[T_1D]; 554 555 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 556 ContractX3d(P_1D, Q_1D, r_U + comp * P_1D, s_B, r_t1, scratch); 557 ContractY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 558 ContractZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 559 ContractX3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D, scratch); 560 ContractY3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D, scratch); 561 ContractZ3d(Q_1D, Q_1D, r_t1, s_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D, scratch); 562 } 563 } 564 565 //------------------------------------------------------------------------------ 566 // 3D derivatives transpose 567 //------------------------------------------------------------------------------ 568 inline void GradTransposeTensorCollocated3d(const CeedInt NUM_COMP, const CeedInt P_1D, const CeedInt Q_1D, private const CeedScalar *restrict r_U, 569 local const CeedScalar *restrict s_B, local const CeedScalar *restrict s_G, 570 private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 571 CeedScalar r_t1[T_1D]; 572 CeedScalar r_t2[T_1D]; 573 574 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 575 ContractTransposeZ3d(Q_1D, Q_1D, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 576 ContractTransposeAddY3d(Q_1D, Q_1D, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 577 ContractTransposeAddX3d(Q_1D, Q_1D, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, s_G, r_t2, scratch); 578 ContractTransposeZ3d(P_1D, Q_1D, r_t2, s_B, r_t1, scratch); 579 ContractTransposeY3d(P_1D, Q_1D, r_t1, s_B, r_t2, scratch); 580 ContractTransposeX3d(P_1D, Q_1D, r_t2, s_B, r_V + comp * P_1D, scratch); 581 } 582 } 583 584 //------------------------------------------------------------------------------ 585 // 3D quadrature weights 586 //------------------------------------------------------------------------------ 587 // template <int Q_1D> 588 inline void WeightTensor3d(const CeedInt Q_1D, const CeedScalar *restrict q_weight_1d, CeedScalar *restrict w) { 589 const CeedInt item_id_x = get_local_id(0); 590 const CeedInt item_id_y = get_local_id(1); 591 592 if (item_id_x < Q_1D && item_id_y < Q_1D) { 593 const CeedScalar w_xy = q_weight_1d[item_id_x] * q_weight_1d[item_id_y]; 594 for (CeedInt q = 0; q < Q_1D; ++q) w[q] = w_xy * q_weight_1d[q]; 595 } else { 596 for (CeedInt q = 0; q < Q_1D; q++) w[q] = 0.0; 597 } 598 } 599