1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis templates 10 11 #include <ceed/types.h> 12 13 //------------------------------------------------------------------------------ 14 // 1D 15 //------------------------------------------------------------------------------ 16 17 //------------------------------------------------------------------------------ 18 // 1D tensor contraction x 19 //------------------------------------------------------------------------------ 20 template <int NUM_COMP, int P_1D, int Q_1D> 21 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 22 data.slice[data.t_id_x] = *U; 23 __syncthreads(); 24 *V = 0.0; 25 if (data.t_id_x < Q_1D) { 26 for (CeedInt i = 0; i < P_1D; i++) { 27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 28 } 29 } 30 __syncthreads(); 31 } 32 33 //------------------------------------------------------------------------------ 34 // 1D transpose tensor contraction x 35 //------------------------------------------------------------------------------ 36 template <int NUM_COMP, int P_1D, int Q_1D> 37 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 38 data.slice[data.t_id_x] = *U; 39 __syncthreads(); 40 *V = 0.0; 41 if (data.t_id_x < P_1D) { 42 for (CeedInt i = 0; i < Q_1D; i++) { 43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 44 } 45 } 46 __syncthreads(); 47 } 48 49 //------------------------------------------------------------------------------ 50 // 1D interpolate to quadrature points 51 //------------------------------------------------------------------------------ 52 template <int NUM_COMP, int P_1D, int Q_1D> 53 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 54 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 55 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 56 } 57 } 58 59 //------------------------------------------------------------------------------ 60 // 1D interpolate transpose 61 //------------------------------------------------------------------------------ 62 template <int NUM_COMP, int P_1D, int Q_1D> 63 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 64 CeedScalar *__restrict__ r_V) { 65 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 66 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]); 67 } 68 } 69 70 //------------------------------------------------------------------------------ 71 // 1D derivatives at quadrature points 72 //------------------------------------------------------------------------------ 73 template <int NUM_COMP, int P_1D, int Q_1D> 74 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 75 CeedScalar *__restrict__ r_V) { 76 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 77 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 78 } 79 } 80 81 //------------------------------------------------------------------------------ 82 // 1D derivatives transpose 83 //------------------------------------------------------------------------------ 84 template <int NUM_COMP, int P_1D, int Q_1D> 85 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 86 CeedScalar *__restrict__ r_V) { 87 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 88 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]); 89 } 90 } 91 92 //------------------------------------------------------------------------------ 93 // 1D quadrature weights 94 //------------------------------------------------------------------------------ 95 template <int Q_1D> 96 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 97 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 98 } 99 100 //------------------------------------------------------------------------------ 101 // 2D 102 //------------------------------------------------------------------------------ 103 104 //------------------------------------------------------------------------------ 105 // 2D tensor contraction x 106 //------------------------------------------------------------------------------ 107 template <int NUM_COMP, int P_1D, int Q_1D> 108 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 109 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 110 __syncthreads(); 111 *V = 0.0; 112 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 113 for (CeedInt i = 0; i < P_1D; i++) { 114 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 115 } 116 } 117 __syncthreads(); 118 } 119 120 //------------------------------------------------------------------------------ 121 // 2D tensor contract y 122 //------------------------------------------------------------------------------ 123 template <int NUM_COMP, int P_1D, int Q_1D> 124 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 125 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 126 __syncthreads(); 127 *V = 0.0; 128 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 129 for (CeedInt i = 0; i < P_1D; i++) { 130 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 131 } 132 } 133 __syncthreads(); 134 } 135 136 //------------------------------------------------------------------------------ 137 // 2D transpose tensor contract y 138 //------------------------------------------------------------------------------ 139 template <int NUM_COMP, int P_1D, int Q_1D> 140 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 141 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 142 __syncthreads(); 143 *V = 0.0; 144 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 145 for (CeedInt i = 0; i < Q_1D; i++) { 146 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 147 } 148 } 149 __syncthreads(); 150 } 151 152 //------------------------------------------------------------------------------ 153 // 2D transpose tensor contract x 154 //------------------------------------------------------------------------------ 155 template <int NUM_COMP, int P_1D, int Q_1D> 156 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 157 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 158 __syncthreads(); 159 *V = 0.0; 160 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 161 for (CeedInt i = 0; i < Q_1D; i++) { 162 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 163 } 164 } 165 __syncthreads(); 166 } 167 168 //------------------------------------------------------------------------------ 169 // 2D transpose tensor contract and add x 170 //------------------------------------------------------------------------------ 171 template <int NUM_COMP, int P_1D, int Q_1D> 172 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 173 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 174 __syncthreads(); 175 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 176 for (CeedInt i = 0; i < Q_1D; i++) { 177 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 178 } 179 } 180 __syncthreads(); 181 } 182 183 //------------------------------------------------------------------------------ 184 // 2D interpolate to quadrature points 185 //------------------------------------------------------------------------------ 186 template <int NUM_COMP, int P_1D, int Q_1D> 187 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 188 CeedScalar *__restrict__ r_V) { 189 CeedScalar r_t[1]; 190 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 191 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t); 192 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]); 193 } 194 } 195 196 //------------------------------------------------------------------------------ 197 // 2D interpolate transpose 198 //------------------------------------------------------------------------------ 199 template <int NUM_COMP, int P_1D, int Q_1D> 200 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 201 CeedScalar *__restrict__ r_V) { 202 CeedScalar r_t[1]; 203 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 204 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t); 205 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]); 206 } 207 } 208 209 //------------------------------------------------------------------------------ 210 // 2D derivatives at quadrature points 211 //------------------------------------------------------------------------------ 212 template <int NUM_COMP, int P_1D, int Q_1D> 213 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 214 CeedScalar *__restrict__ r_V) { 215 CeedScalar r_t[1]; 216 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 217 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, r_t); 218 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]); 219 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, r_t); 220 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]); 221 } 222 } 223 224 //------------------------------------------------------------------------------ 225 // 2D derivatives transpose 226 //------------------------------------------------------------------------------ 227 template <int NUM_COMP, int P_1D, int Q_1D> 228 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 229 CeedScalar *__restrict__ r_V) { 230 CeedScalar r_t[1]; 231 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 232 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); 233 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, &r_V[comp]); 234 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t); 235 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, &r_V[comp]); 236 } 237 } 238 239 //------------------------------------------------------------------------------ 240 // 2D quadrature weights 241 //------------------------------------------------------------------------------ 242 template <int Q_1D> 243 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 244 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 245 } 246 247 //------------------------------------------------------------------------------ 248 // 3D 249 //------------------------------------------------------------------------------ 250 251 //------------------------------------------------------------------------------ 252 // 3D tensor contract x 253 //------------------------------------------------------------------------------ 254 template <int NUM_COMP, int P_1D, int Q_1D> 255 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 256 CeedScalar r_B[P_1D]; 257 for (CeedInt i = 0; i < P_1D; i++) { 258 r_B[i] = B[i + data.t_id_x * P_1D]; 259 } 260 261 for (CeedInt k = 0; k < P_1D; k++) { 262 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 263 __syncthreads(); 264 V[k] = 0.0; 265 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 266 for (CeedInt i = 0; i < P_1D; i++) { 267 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 268 } 269 } 270 __syncthreads(); 271 } 272 } 273 274 //------------------------------------------------------------------------------ 275 // 3D tensor contract y 276 //------------------------------------------------------------------------------ 277 template <int NUM_COMP, int P_1D, int Q_1D> 278 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 279 CeedScalar r_B[P_1D]; 280 for (CeedInt i = 0; i < P_1D; i++) { 281 r_B[i] = B[i + data.t_id_y * P_1D]; 282 } 283 284 for (CeedInt k = 0; k < P_1D; k++) { 285 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 286 __syncthreads(); 287 V[k] = 0.0; 288 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 289 for (CeedInt i = 0; i < P_1D; i++) { 290 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 291 } 292 } 293 __syncthreads(); 294 } 295 } 296 297 //------------------------------------------------------------------------------ 298 // 3D tensor contract z 299 //------------------------------------------------------------------------------ 300 template <int NUM_COMP, int P_1D, int Q_1D> 301 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 302 for (CeedInt k = 0; k < Q_1D; k++) { 303 V[k] = 0.0; 304 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 305 for (CeedInt i = 0; i < P_1D; i++) { 306 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 307 } 308 } 309 } 310 } 311 312 //------------------------------------------------------------------------------ 313 // 3D transpose tensor contract z 314 //------------------------------------------------------------------------------ 315 template <int NUM_COMP, int P_1D, int Q_1D> 316 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 317 for (CeedInt k = 0; k < P_1D; k++) { 318 V[k] = 0.0; 319 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 320 for (CeedInt i = 0; i < Q_1D; i++) { 321 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 322 } 323 } 324 } 325 } 326 327 //------------------------------------------------------------------------------ 328 // 3D transpose tensor contract y 329 //------------------------------------------------------------------------------ 330 template <int NUM_COMP, int P_1D, int Q_1D> 331 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 332 CeedScalar r_B[Q_1D]; 333 for (CeedInt i = 0; i < Q_1D; i++) { 334 r_B[i] = B[data.t_id_y + i * P_1D]; 335 } 336 337 for (CeedInt k = 0; k < P_1D; k++) { 338 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 339 __syncthreads(); 340 V[k] = 0.0; 341 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 342 for (CeedInt i = 0; i < Q_1D; i++) { 343 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 344 } 345 } 346 __syncthreads(); 347 } 348 } 349 350 //------------------------------------------------------------------------------ 351 // 3D transpose tensor contract y 352 //------------------------------------------------------------------------------ 353 template <int NUM_COMP, int P_1D, int Q_1D> 354 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 355 CeedScalar r_B[Q_1D]; 356 for (CeedInt i = 0; i < Q_1D; i++) { 357 r_B[i] = B[data.t_id_y + i * P_1D]; 358 } 359 360 for (CeedInt k = 0; k < P_1D; k++) { 361 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 362 __syncthreads(); 363 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 364 for (CeedInt i = 0; i < Q_1D; i++) { 365 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 366 } 367 } 368 __syncthreads(); 369 } 370 } 371 372 //------------------------------------------------------------------------------ 373 // 3D transpose tensor contract x 374 //------------------------------------------------------------------------------ 375 template <int NUM_COMP, int P_1D, int Q_1D> 376 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 377 CeedScalar r_B[Q_1D]; 378 for (CeedInt i = 0; i < Q_1D; i++) { 379 r_B[i] = B[data.t_id_x + i * P_1D]; 380 } 381 382 for (CeedInt k = 0; k < P_1D; k++) { 383 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 384 __syncthreads(); 385 V[k] = 0.0; 386 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 387 for (CeedInt i = 0; i < Q_1D; i++) { 388 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 389 } 390 } 391 __syncthreads(); 392 } 393 } 394 395 //------------------------------------------------------------------------------ 396 // 3D transpose tensor contract add x 397 //------------------------------------------------------------------------------ 398 template <int NUM_COMP, int P_1D, int Q_1D> 399 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 400 CeedScalar r_B[Q_1D]; 401 for (CeedInt i = 0; i < Q_1D; i++) { 402 r_B[i] = B[data.t_id_x + i * P_1D]; 403 } 404 405 for (CeedInt k = 0; k < P_1D; k++) { 406 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 407 __syncthreads(); 408 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 409 for (CeedInt i = 0; i < Q_1D; i++) { 410 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 411 } 412 } 413 __syncthreads(); 414 } 415 } 416 417 //------------------------------------------------------------------------------ 418 // 3D interpolate to quadrature points 419 //------------------------------------------------------------------------------ 420 template <int NUM_COMP, int P_1D, int Q_1D> 421 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 422 CeedScalar *__restrict__ r_V) { 423 CeedScalar r_t1[T_1D]; 424 CeedScalar r_t2[T_1D]; 425 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 426 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 427 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 428 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]); 429 } 430 } 431 432 //------------------------------------------------------------------------------ 433 // 3D interpolate transpose 434 //------------------------------------------------------------------------------ 435 template <int NUM_COMP, int P_1D, int Q_1D> 436 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 437 CeedScalar *__restrict__ r_V) { 438 CeedScalar r_t1[T_1D]; 439 CeedScalar r_t2[T_1D]; 440 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 441 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D], c_B, r_t1); 442 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 443 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 444 } 445 } 446 447 //------------------------------------------------------------------------------ 448 // 3D derivatives at quadrature points 449 //------------------------------------------------------------------------------ 450 template <int NUM_COMP, int P_1D, int Q_1D> 451 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 452 CeedScalar *__restrict__ r_V) { 453 CeedScalar r_t1[T_1D]; 454 CeedScalar r_t2[T_1D]; 455 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 456 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_G, r_t1); 457 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 458 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 459 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 460 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 461 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 462 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 463 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 464 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 465 } 466 } 467 468 //------------------------------------------------------------------------------ 469 // 3D derivatives transpose 470 //------------------------------------------------------------------------------ 471 template <int NUM_COMP, int P_1D, int Q_1D> 472 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 473 CeedScalar *__restrict__ r_V) { 474 CeedScalar r_t1[T_1D]; 475 CeedScalar r_t2[T_1D]; 476 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 477 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1); 478 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 479 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, &r_V[comp * P_1D]); 480 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1); 481 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 482 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 483 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1); 484 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 485 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 486 } 487 } 488 489 //------------------------------------------------------------------------------ 490 // 3D derivatives at quadrature points 491 //------------------------------------------------------------------------------ 492 template <int NUM_COMP, int P_1D, int Q_1D> 493 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 494 CeedScalar *__restrict__ r_V) { 495 CeedScalar r_t1[T_1D]; 496 CeedScalar r_t2[T_1D]; 497 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 498 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp * P_1D], c_B, r_t1); 499 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 500 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 501 ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]); 502 ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]); 503 ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]); 504 } 505 } 506 507 //------------------------------------------------------------------------------ 508 // 3D derivatives transpose 509 //------------------------------------------------------------------------------ 510 template <int NUM_COMP, int P_1D, int Q_1D> 511 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 512 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 513 CeedScalar r_t1[T_1D]; 514 CeedScalar r_t2[T_1D]; 515 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 516 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2); 517 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2); 518 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2); 519 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 520 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 521 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, &r_V[comp * P_1D]); 522 } 523 } 524 525 //------------------------------------------------------------------------------ 526 // 3D quadrature weights 527 //------------------------------------------------------------------------------ 528 template <int Q_1D> 529 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 530 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 531 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 532 for (CeedInt q = 0; q < Q_1D; q++) { 533 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 534 } 535 } 536