1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // 1D 14 //------------------------------------------------------------------------------ 15 16 //------------------------------------------------------------------------------ 17 // 1D tensor contraction x 18 //------------------------------------------------------------------------------ 19 template <int NUM_COMP, int P_1D, int Q_1D> 20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 21 data.slice[data.t_id_x] = *U; 22 __syncthreads(); 23 *V = 0.0; 24 if (data.t_id_x < Q_1D) { 25 for (CeedInt i = 0; i < P_1D; i++) { 26 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 27 } 28 } 29 __syncthreads(); 30 } 31 32 //------------------------------------------------------------------------------ 33 // 1D transpose tensor contraction x 34 //------------------------------------------------------------------------------ 35 template <int NUM_COMP, int P_1D, int Q_1D> 36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 37 data.slice[data.t_id_x] = *U; 38 __syncthreads(); 39 *V = 0.0; 40 if (data.t_id_x < P_1D) { 41 for (CeedInt i = 0; i < Q_1D; i++) { 42 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 43 } 44 } 45 __syncthreads(); 46 } 47 48 //------------------------------------------------------------------------------ 49 // 1D interpolate to quadrature points 50 //------------------------------------------------------------------------------ 51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 55 } 56 } 57 58 //------------------------------------------------------------------------------ 59 // 1D interpolate transpose 60 //------------------------------------------------------------------------------ 61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 63 CeedScalar *__restrict__ r_V) { 64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // 1D derivatives at quadrature points 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 73 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 74 CeedScalar *__restrict__ r_V) { 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 76 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 77 } 78 } 79 80 //------------------------------------------------------------------------------ 81 // 1D derivatives transpose 82 //------------------------------------------------------------------------------ 83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 84 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 85 CeedScalar *__restrict__ r_V) { 86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 87 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 88 } 89 } 90 91 //------------------------------------------------------------------------------ 92 // 1D quadrature weights 93 //------------------------------------------------------------------------------ 94 template <int Q_1D> 95 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 96 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // 2D tensor contraction x 105 //------------------------------------------------------------------------------ 106 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 107 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 108 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 109 __syncthreads(); 110 *V = 0.0; 111 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 112 for (CeedInt i = 0; i < P_1D; i++) { 113 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 114 } 115 } 116 __syncthreads(); 117 } 118 119 //------------------------------------------------------------------------------ 120 // 2D tensor contract y 121 //------------------------------------------------------------------------------ 122 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 123 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 124 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 125 __syncthreads(); 126 *V = 0.0; 127 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 128 for (CeedInt i = 0; i < P_1D; i++) { 129 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 130 } 131 } 132 __syncthreads(); 133 } 134 135 //------------------------------------------------------------------------------ 136 // 2D transpose tensor contract y 137 //------------------------------------------------------------------------------ 138 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 139 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 140 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 141 __syncthreads(); 142 *V = 0.0; 143 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 144 for (CeedInt i = 0; i < Q_1D; i++) { 145 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 146 } 147 } 148 __syncthreads(); 149 } 150 151 //------------------------------------------------------------------------------ 152 // 2D transpose tensor contract x 153 //------------------------------------------------------------------------------ 154 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 155 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 156 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 157 __syncthreads(); 158 *V = 0.0; 159 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 160 for (CeedInt i = 0; i < Q_1D; i++) { 161 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 162 } 163 } 164 __syncthreads(); 165 } 166 167 //------------------------------------------------------------------------------ 168 // 2D transpose tensor contract and add x 169 //------------------------------------------------------------------------------ 170 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 171 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 172 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 173 __syncthreads(); 174 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 175 for (CeedInt i = 0; i < Q_1D; i++) { 176 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 177 } 178 } 179 __syncthreads(); 180 } 181 182 //------------------------------------------------------------------------------ 183 // 2D interpolate to quadrature points 184 //------------------------------------------------------------------------------ 185 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 186 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 187 CeedScalar *__restrict__ r_V) { 188 CeedScalar r_t[1]; 189 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 190 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 191 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 192 } 193 } 194 195 //------------------------------------------------------------------------------ 196 // 2D interpolate transpose 197 //------------------------------------------------------------------------------ 198 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 199 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 200 CeedScalar *__restrict__ r_V) { 201 CeedScalar r_t[1]; 202 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 203 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 204 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 205 } 206 } 207 208 //------------------------------------------------------------------------------ 209 // 2D derivatives at quadrature points 210 //------------------------------------------------------------------------------ 211 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 212 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 213 CeedScalar *__restrict__ r_V) { 214 CeedScalar r_t[1]; 215 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 216 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t); 217 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 218 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t); 219 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 220 } 221 } 222 223 //------------------------------------------------------------------------------ 224 // 2D derivatives transpose 225 //------------------------------------------------------------------------------ 226 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 227 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 228 CeedScalar *__restrict__ r_V) { 229 CeedScalar r_t[1]; 230 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 231 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 232 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]); 233 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 234 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]); 235 } 236 } 237 238 //------------------------------------------------------------------------------ 239 // 2D quadrature weights 240 //------------------------------------------------------------------------------ 241 template <int Q_1D> 242 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 243 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 244 } 245 246 //------------------------------------------------------------------------------ 247 // 3D 248 //------------------------------------------------------------------------------ 249 250 //------------------------------------------------------------------------------ 251 // 3D tensor contract x 252 //------------------------------------------------------------------------------ 253 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 254 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 255 CeedScalar r_B[P_1D]; 256 for (CeedInt i = 0; i < P_1D; i++) { 257 r_B[i] = B[i + data.t_id_x * P_1D]; 258 } 259 260 for (CeedInt k = 0; k < P_1D; k++) { 261 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 262 __syncthreads(); 263 V[k] = 0.0; 264 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 265 for (CeedInt i = 0; i < P_1D; i++) { 266 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 267 } 268 } 269 __syncthreads(); 270 } 271 } 272 273 //------------------------------------------------------------------------------ 274 // 3D tensor contract y 275 //------------------------------------------------------------------------------ 276 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 277 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 278 CeedScalar r_B[P_1D]; 279 for (CeedInt i = 0; i < P_1D; i++) { 280 r_B[i] = B[i + data.t_id_y * P_1D]; 281 } 282 283 for (CeedInt k = 0; k < P_1D; k++) { 284 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 285 __syncthreads(); 286 V[k] = 0.0; 287 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 288 for (CeedInt i = 0; i < P_1D; i++) { 289 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 290 } 291 } 292 __syncthreads(); 293 } 294 } 295 296 //------------------------------------------------------------------------------ 297 // 3D tensor contract z 298 //------------------------------------------------------------------------------ 299 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 300 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 301 for (CeedInt k = 0; k < Q_1D; k++) { 302 V[k] = 0.0; 303 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 304 for (CeedInt i = 0; i < P_1D; i++) { 305 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 306 } 307 } 308 } 309 } 310 311 //------------------------------------------------------------------------------ 312 // 3D transpose tensor contract z 313 //------------------------------------------------------------------------------ 314 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 315 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 316 for (CeedInt k = 0; k < P_1D; k++) { 317 V[k] = 0.0; 318 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 319 for (CeedInt i = 0; i < Q_1D; i++) { 320 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 321 } 322 } 323 } 324 } 325 326 //------------------------------------------------------------------------------ 327 // 3D transpose tensor contract y 328 //------------------------------------------------------------------------------ 329 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 330 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 331 CeedScalar r_B[Q_1D]; 332 for (CeedInt i = 0; i < Q_1D; i++) { 333 r_B[i] = B[data.t_id_y + i * P_1D]; 334 } 335 336 for (CeedInt k = 0; k < P_1D; k++) { 337 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 338 __syncthreads(); 339 V[k] = 0.0; 340 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 341 for (CeedInt i = 0; i < Q_1D; i++) { 342 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 343 } 344 } 345 __syncthreads(); 346 } 347 } 348 349 //------------------------------------------------------------------------------ 350 // 3D transpose tensor contract y 351 //------------------------------------------------------------------------------ 352 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 353 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 354 CeedScalar r_B[Q_1D]; 355 for (CeedInt i = 0; i < Q_1D; i++) { 356 r_B[i] = B[data.t_id_y + i * P_1D]; 357 } 358 359 for (CeedInt k = 0; k < P_1D; k++) { 360 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 361 __syncthreads(); 362 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 363 for (CeedInt i = 0; i < Q_1D; i++) { 364 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 365 } 366 } 367 __syncthreads(); 368 } 369 } 370 371 //------------------------------------------------------------------------------ 372 // 3D transpose tensor contract x 373 //------------------------------------------------------------------------------ 374 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 375 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 376 CeedScalar r_B[Q_1D]; 377 for (CeedInt i = 0; i < Q_1D; i++) { 378 r_B[i] = B[data.t_id_x + i * P_1D]; 379 } 380 381 for (CeedInt k = 0; k < P_1D; k++) { 382 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 383 __syncthreads(); 384 V[k] = 0.0; 385 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 386 for (CeedInt i = 0; i < Q_1D; i++) { 387 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 388 } 389 } 390 __syncthreads(); 391 } 392 } 393 394 //------------------------------------------------------------------------------ 395 // 3D transpose tensor contract add x 396 //------------------------------------------------------------------------------ 397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 398 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 399 CeedScalar r_B[Q_1D]; 400 for (CeedInt i = 0; i < Q_1D; i++) { 401 r_B[i] = B[data.t_id_x + i * P_1D]; 402 } 403 404 for (CeedInt k = 0; k < P_1D; k++) { 405 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 406 __syncthreads(); 407 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 408 for (CeedInt i = 0; i < Q_1D; i++) { 409 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 410 } 411 } 412 __syncthreads(); 413 } 414 } 415 416 //------------------------------------------------------------------------------ 417 // 3D interpolate to quadrature points 418 //------------------------------------------------------------------------------ 419 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 420 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 421 CeedScalar *__restrict__ r_V) { 422 CeedScalar r_t1[T_1D]; 423 CeedScalar r_t2[T_1D]; 424 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 425 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 426 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 427 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 428 } 429 } 430 431 //------------------------------------------------------------------------------ 432 // 3D interpolate transpose 433 //------------------------------------------------------------------------------ 434 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 435 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 436 CeedScalar *__restrict__ r_V) { 437 CeedScalar r_t1[T_1D]; 438 CeedScalar r_t2[T_1D]; 439 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 440 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 441 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 442 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 443 } 444 } 445 446 //------------------------------------------------------------------------------ 447 // 3D derivatives at quadrature points 448 //------------------------------------------------------------------------------ 449 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 450 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 451 CeedScalar *__restrict__ r_V) { 452 CeedScalar r_t1[T_1D]; 453 CeedScalar r_t2[T_1D]; 454 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 455 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 456 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 457 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 458 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 459 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 460 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 461 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 462 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 463 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 464 } 465 } 466 467 //------------------------------------------------------------------------------ 468 // 3D derivatives transpose 469 //------------------------------------------------------------------------------ 470 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 471 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 472 CeedScalar *__restrict__ r_V) { 473 CeedScalar r_t1[T_1D]; 474 CeedScalar r_t2[T_1D]; 475 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 476 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 477 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 478 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 479 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 480 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2); 481 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 482 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 483 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 484 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 485 } 486 } 487 488 //------------------------------------------------------------------------------ 489 // 3D derivatives at quadrature points 490 //------------------------------------------------------------------------------ 491 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 492 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 493 CeedScalar *__restrict__ r_V) { 494 CeedScalar r_t1[T_1D]; 495 CeedScalar r_t2[T_1D]; 496 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 497 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 498 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 499 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 500 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 501 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 502 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 503 } 504 } 505 506 //------------------------------------------------------------------------------ 507 // 3D derivatives transpose 508 //------------------------------------------------------------------------------ 509 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D> 510 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 511 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 512 CeedScalar r_t1[T_1D]; 513 CeedScalar r_t2[T_1D]; 514 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 515 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 516 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 517 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 518 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1); 519 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2); 520 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 521 } 522 } 523 524 //------------------------------------------------------------------------------ 525 // 3D quadrature weights 526 //------------------------------------------------------------------------------ 527 template <int Q_1D> 528 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 529 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 530 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 531 for (CeedInt q = 0; q < Q_1D; q++) { 532 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 533 } 534 } 535