1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 __syncthreads(); 22 data.slice[data.t_id_x] = *U; 23 __syncthreads(); 24 *V = 0.0; 25 if (data.t_id_x < Q_1D) { 26 for (CeedInt i = 0; i < P_1D; i++) { 27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 28 } 29 } 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 __syncthreads(); 38 data.slice[data.t_id_x] = *U; 39 __syncthreads(); 40 *V = 0.0; 41 if (data.t_id_x < P_1D) { 42 for (CeedInt i = 0; i < Q_1D; i++) { 43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 44 } 45 } 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D derivatives at quadrature points 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D derivatives transpose 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int P_1D, int Q_1D> 95 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108 __syncthreads(); 109 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 110 __syncthreads(); 111 *V = 0.0; 112 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 113 for (CeedInt i = 0; i < P_1D; i++) { 114 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 115 } 116 } 117 } 118 119 //------------------------------------------------------------------------------ 120 // 2D tensor contract y 121 //------------------------------------------------------------------------------ 122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 123 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 __syncthreads(); 125 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 126 __syncthreads(); 127 *V = 0.0; 128 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 129 for (CeedInt i = 0; i < P_1D; i++) { 130 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 131 } 132 } 133 } 134 135 //------------------------------------------------------------------------------ 136 // 2D transpose tensor contract y 137 //------------------------------------------------------------------------------ 138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 139 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 __syncthreads(); 141 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 142 __syncthreads(); 143 *V = 0.0; 144 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 145 for (CeedInt i = 0; i < Q_1D; i++) { 146 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 147 } 148 } 149 } 150 151 //------------------------------------------------------------------------------ 152 // 2D transpose tensor contract x 153 //------------------------------------------------------------------------------ 154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 155 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156 __syncthreads(); 157 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 158 __syncthreads(); 159 *V = 0.0; 160 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 161 for (CeedInt i = 0; i < Q_1D; i++) { 162 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 163 } 164 } 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract and add x 169 //------------------------------------------------------------------------------ 170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 171 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172 __syncthreads(); 173 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 174 __syncthreads(); 175 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 176 for (CeedInt i = 0; i < Q_1D; i++) { 177 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 178 } 179 } 180 } 181 182 //------------------------------------------------------------------------------ 183 // 2D interpolate to quadrature points 184 //------------------------------------------------------------------------------ 185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 187 CeedScalar *__restrict__ r_V) { 188 CeedScalar r_t[1]; 189 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 190 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 191 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 192 } 193 } 194 195 //------------------------------------------------------------------------------ 196 // 2D interpolate transpose 197 //------------------------------------------------------------------------------ 198 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 199 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 200 CeedScalar *__restrict__ r_V) { 201 CeedScalar r_t[1]; 202 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 203 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 204 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 205 } 206 } 207 208 //------------------------------------------------------------------------------ 209 // 2D derivatives at quadrature points 210 //------------------------------------------------------------------------------ 211 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 212 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 213 CeedScalar *__restrict__ r_V) { 214 CeedScalar r_t[1]; 215 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 216 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 217 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 218 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 219 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 220 } 221 } 222 223 //------------------------------------------------------------------------------ 224 // 2D derivatives transpose 225 //------------------------------------------------------------------------------ 226 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 227 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 228 CeedScalar *__restrict__ r_V) { 229 CeedScalar r_t[1]; 230 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 231 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 232 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 233 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 234 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 235 } 236 } 237 238 //------------------------------------------------------------------------------ 239 // 2D derivatives at quadrature points, nodes and quadrature points collocated 240 //------------------------------------------------------------------------------ 241 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 242 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 243 CeedScalar *__restrict__ r_V) { 244 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 245 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]); 246 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]); 247 } 248 } 249 250 //------------------------------------------------------------------------------ 251 // 2D derivatives transpose, nodes and quadrature points collocated 252 //------------------------------------------------------------------------------ 253 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 254 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 255 CeedScalar *__restrict__ r_V) { 256 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 257 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 258 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 259 } 260 } 261 262 //------------------------------------------------------------------------------ 263 // 2D quadrature weights 264 //------------------------------------------------------------------------------ 265 template <int P_1D, int Q_1D> 266 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 267 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 268 } 269 270 //------------------------------------------------------------------------------ 271 // 3D 272 //------------------------------------------------------------------------------ 273 274 //------------------------------------------------------------------------------ 275 // 3D tensor contract x 276 //------------------------------------------------------------------------------ 277 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 278 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 279 CeedScalar r_B[P_1D]; 280 for (CeedInt i = 0; i < P_1D; i++) { 281 r_B[i] = B[i + data.t_id_x * P_1D]; 282 } 283 284 for (CeedInt k = 0; k < P_1D; k++) { 285 __syncthreads(); 286 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 287 __syncthreads(); 288 V[k] = 0.0; 289 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 290 for (CeedInt i = 0; i < P_1D; i++) { 291 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 292 } 293 } 294 } 295 } 296 297 //------------------------------------------------------------------------------ 298 // 3D tensor contract y 299 //------------------------------------------------------------------------------ 300 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 301 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 302 CeedScalar r_B[P_1D]; 303 for (CeedInt i = 0; i < P_1D; i++) { 304 r_B[i] = B[i + data.t_id_y * P_1D]; 305 } 306 307 for (CeedInt k = 0; k < P_1D; k++) { 308 __syncthreads(); 309 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 310 __syncthreads(); 311 V[k] = 0.0; 312 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 313 for (CeedInt i = 0; i < P_1D; i++) { 314 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 315 } 316 } 317 } 318 } 319 320 //------------------------------------------------------------------------------ 321 // 3D tensor contract z 322 //------------------------------------------------------------------------------ 323 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 324 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 325 for (CeedInt k = 0; k < Q_1D; k++) { 326 V[k] = 0.0; 327 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 328 for (CeedInt i = 0; i < P_1D; i++) { 329 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 330 } 331 } 332 } 333 } 334 335 //------------------------------------------------------------------------------ 336 // 3D transpose tensor contract z 337 //------------------------------------------------------------------------------ 338 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 339 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 340 for (CeedInt k = 0; k < P_1D; k++) { 341 V[k] = 0.0; 342 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 343 for (CeedInt i = 0; i < Q_1D; i++) { 344 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 345 } 346 } 347 } 348 } 349 350 //------------------------------------------------------------------------------ 351 // 3D transpose tensor contract y 352 //------------------------------------------------------------------------------ 353 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 354 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 355 CeedScalar r_B[Q_1D]; 356 for (CeedInt i = 0; i < Q_1D; i++) { 357 r_B[i] = B[data.t_id_y + i * P_1D]; 358 } 359 360 for (CeedInt k = 0; k < P_1D; k++) { 361 __syncthreads(); 362 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 363 __syncthreads(); 364 V[k] = 0.0; 365 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 366 for (CeedInt i = 0; i < Q_1D; i++) { 367 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 368 } 369 } 370 } 371 } 372 373 //------------------------------------------------------------------------------ 374 // 3D transpose tensor contract y 375 //------------------------------------------------------------------------------ 376 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 377 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 378 CeedScalar r_B[Q_1D]; 379 for (CeedInt i = 0; i < Q_1D; i++) { 380 r_B[i] = B[data.t_id_y + i * P_1D]; 381 } 382 383 for (CeedInt k = 0; k < P_1D; k++) { 384 __syncthreads(); 385 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 386 __syncthreads(); 387 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 388 for (CeedInt i = 0; i < Q_1D; i++) { 389 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 390 } 391 } 392 } 393 } 394 395 //------------------------------------------------------------------------------ 396 // 3D transpose tensor contract x 397 //------------------------------------------------------------------------------ 398 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 399 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 400 CeedScalar r_B[Q_1D]; 401 for (CeedInt i = 0; i < Q_1D; i++) { 402 r_B[i] = B[data.t_id_x + i * P_1D]; 403 } 404 405 for (CeedInt k = 0; k < P_1D; k++) { 406 __syncthreads(); 407 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 408 __syncthreads(); 409 V[k] = 0.0; 410 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 411 for (CeedInt i = 0; i < Q_1D; i++) { 412 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 413 } 414 } 415 } 416 } 417 418 //------------------------------------------------------------------------------ 419 // 3D transpose tensor contract add x 420 //------------------------------------------------------------------------------ 421 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 422 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 423 CeedScalar r_B[Q_1D]; 424 for (CeedInt i = 0; i < Q_1D; i++) { 425 r_B[i] = B[data.t_id_x + i * P_1D]; 426 } 427 428 for (CeedInt k = 0; k < P_1D; k++) { 429 __syncthreads(); 430 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 431 __syncthreads(); 432 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 433 for (CeedInt i = 0; i < Q_1D; i++) { 434 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 435 } 436 } 437 } 438 } 439 440 //------------------------------------------------------------------------------ 441 // 3D interpolate to quadrature points 442 //------------------------------------------------------------------------------ 443 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 444 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 445 CeedScalar *__restrict__ r_V) { 446 CeedScalar r_t1[T_1D]; 447 CeedScalar r_t2[T_1D]; 448 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 449 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 450 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 451 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 452 } 453 } 454 455 //------------------------------------------------------------------------------ 456 // 3D interpolate transpose 457 //------------------------------------------------------------------------------ 458 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 459 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 460 CeedScalar *__restrict__ r_V) { 461 CeedScalar r_t1[T_1D]; 462 CeedScalar r_t2[T_1D]; 463 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 464 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 465 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 466 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 467 } 468 } 469 470 //------------------------------------------------------------------------------ 471 // 3D derivatives at quadrature points 472 //------------------------------------------------------------------------------ 473 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 474 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 475 CeedScalar *__restrict__ r_V) { 476 CeedScalar r_t1[T_1D]; 477 CeedScalar r_t2[T_1D]; 478 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 479 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 480 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 481 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 482 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 483 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 484 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 485 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 486 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 487 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 488 } 489 } 490 491 //------------------------------------------------------------------------------ 492 // 3D derivatives transpose 493 //------------------------------------------------------------------------------ 494 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 495 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 496 CeedScalar *__restrict__ r_V) { 497 CeedScalar r_t1[T_1D]; 498 CeedScalar r_t2[T_1D]; 499 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 500 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 501 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 502 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 503 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 504 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 505 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 506 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 507 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 508 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 509 } 510 } 511 512 //------------------------------------------------------------------------------ 513 // 3D derivatives at quadrature points 514 //------------------------------------------------------------------------------ 515 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 516 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 517 CeedScalar *__restrict__ r_V) { 518 CeedScalar r_t1[T_1D]; 519 CeedScalar r_t2[T_1D]; 520 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 521 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 522 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 523 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 524 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 525 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 526 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 527 } 528 } 529 530 //------------------------------------------------------------------------------ 531 // 3D derivatives transpose 532 //------------------------------------------------------------------------------ 533 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 534 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 535 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 536 CeedScalar r_t1[T_1D]; 537 CeedScalar r_t2[T_1D]; 538 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 539 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 540 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 541 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 542 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 543 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 544 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 545 } 546 } 547 548 //------------------------------------------------------------------------------ 549 // 3D derivatives at quadrature points, nodes and quadrature points collocated 550 //------------------------------------------------------------------------------ 551 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 552 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 553 CeedScalar *__restrict__ r_V) { 554 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 555 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 556 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 557 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 558 } 559 } 560 561 //------------------------------------------------------------------------------ 562 // 3D derivatives transpose, nodes and quadrature points collocated 563 //------------------------------------------------------------------------------ 564 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 565 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 566 CeedScalar *__restrict__ r_V) { 567 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 568 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 569 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 570 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]); 571 } 572 } 573 574 //------------------------------------------------------------------------------ 575 // 3D quadrature weights 576 //------------------------------------------------------------------------------ 577 template <int P_1D, int Q_1D> 578 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 579 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 580 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 581 for (CeedInt q = 0; q < Q_1D; q++) { 582 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 583 } 584 } 585