1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis templates 10 #ifndef _ceed_cuda_shared_basis_tensor_templates_h 11 #define _ceed_cuda_shared_basis_tensor_templates_h 12 13 #include <ceed.h> 14 15 //------------------------------------------------------------------------------ 16 // 1D 17 //------------------------------------------------------------------------------ 18 19 //------------------------------------------------------------------------------ 20 // 1D tensor contraction x 21 //------------------------------------------------------------------------------ 22 template <int NUM_COMP, int P_1D, int Q_1D> 23 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 24 data.slice[data.t_id_x] = *U; 25 __syncthreads(); 26 *V = 0.0; 27 if (data.t_id_x < Q_1D) { 28 for (CeedInt i = 0; i < P_1D; i++) { 29 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction 30 } 31 } 32 __syncthreads(); 33 } 34 35 //------------------------------------------------------------------------------ 36 // 1D transpose tensor contraction x 37 //------------------------------------------------------------------------------ 38 template <int NUM_COMP, int P_1D, int Q_1D> 39 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 40 data.slice[data.t_id_x] = *U; 41 __syncthreads(); 42 *V = 0.0; 43 if (data.t_id_x < P_1D) { 44 for (CeedInt i = 0; i < Q_1D; i++) { 45 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction 46 } 47 } 48 __syncthreads(); 49 } 50 51 //------------------------------------------------------------------------------ 52 // 1D interpolate to quadrature points 53 //------------------------------------------------------------------------------ 54 template <int NUM_COMP, int P_1D, int Q_1D> 55 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) { 56 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 57 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 58 } 59 } 60 61 //------------------------------------------------------------------------------ 62 // 1D interpolate transpose 63 //------------------------------------------------------------------------------ 64 template <int NUM_COMP, int P_1D, int Q_1D> 65 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 66 CeedScalar *__restrict__ r_V) { 67 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 68 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_V + comp); 69 } 70 } 71 72 //------------------------------------------------------------------------------ 73 // 1D derivatives at quadrature points 74 //------------------------------------------------------------------------------ 75 template <int NUM_COMP, int P_1D, int Q_1D> 76 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 77 CeedScalar *__restrict__ r_V) { 78 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 79 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 80 } 81 } 82 83 //------------------------------------------------------------------------------ 84 // 1D derivatives transpose 85 //------------------------------------------------------------------------------ 86 template <int NUM_COMP, int P_1D, int Q_1D> 87 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 88 CeedScalar *__restrict__ r_V) { 89 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 90 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_V + comp); 91 } 92 } 93 94 //------------------------------------------------------------------------------ 95 // 1D quadrature weights 96 //------------------------------------------------------------------------------ 97 template <int Q_1D> 98 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 99 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0; 100 } 101 102 //------------------------------------------------------------------------------ 103 // 2D 104 //------------------------------------------------------------------------------ 105 106 //------------------------------------------------------------------------------ 107 // 2D tensor contraction x 108 //------------------------------------------------------------------------------ 109 template <int NUM_COMP, int P_1D, int Q_1D> 110 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 111 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 112 __syncthreads(); 113 *V = 0.0; 114 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 115 for (CeedInt i = 0; i < P_1D; i++) { 116 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 117 } 118 } 119 __syncthreads(); 120 } 121 122 //------------------------------------------------------------------------------ 123 // 2D tensor contract y 124 //------------------------------------------------------------------------------ 125 template <int NUM_COMP, int P_1D, int Q_1D> 126 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 127 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 128 __syncthreads(); 129 *V = 0.0; 130 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 131 for (CeedInt i = 0; i < P_1D; i++) { 132 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 133 } 134 } 135 __syncthreads(); 136 } 137 138 //------------------------------------------------------------------------------ 139 // 2D transpose tensor contract y 140 //------------------------------------------------------------------------------ 141 template <int NUM_COMP, int P_1D, int Q_1D> 142 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 143 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 144 __syncthreads(); 145 *V = 0.0; 146 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 147 for (CeedInt i = 0; i < Q_1D; i++) { 148 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 149 } 150 } 151 __syncthreads(); 152 } 153 154 //------------------------------------------------------------------------------ 155 // 2D transpose tensor contract x 156 //------------------------------------------------------------------------------ 157 template <int NUM_COMP, int P_1D, int Q_1D> 158 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 159 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 160 __syncthreads(); 161 *V = 0.0; 162 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 163 for (CeedInt i = 0; i < Q_1D; i++) { 164 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 165 } 166 } 167 __syncthreads(); 168 } 169 170 //------------------------------------------------------------------------------ 171 // 2D transpose tensor contract and add x 172 //------------------------------------------------------------------------------ 173 template <int NUM_COMP, int P_1D, int Q_1D> 174 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 175 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; 176 __syncthreads(); 177 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 178 for (CeedInt i = 0; i < Q_1D; i++) { 179 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 180 } 181 } 182 __syncthreads(); 183 } 184 185 //------------------------------------------------------------------------------ 186 // 2D interpolate to quadrature points 187 //------------------------------------------------------------------------------ 188 template <int NUM_COMP, int P_1D, int Q_1D> 189 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 190 CeedScalar *__restrict__ r_V) { 191 CeedScalar r_t[1]; 192 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 193 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 194 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 195 } 196 } 197 198 //------------------------------------------------------------------------------ 199 // 2D interpolate transpose 200 //------------------------------------------------------------------------------ 201 template <int NUM_COMP, int P_1D, int Q_1D> 202 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 203 CeedScalar *__restrict__ r_V) { 204 CeedScalar r_t[1]; 205 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 206 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 207 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 208 } 209 } 210 211 //------------------------------------------------------------------------------ 212 // 2D derivatives at quadrature points 213 //------------------------------------------------------------------------------ 214 template <int NUM_COMP, int P_1D, int Q_1D> 215 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 216 CeedScalar *__restrict__ r_V) { 217 CeedScalar r_t[1]; 218 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 219 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_G, r_t); 220 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp + 0 * NUM_COMP); 221 ContractX2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp, c_B, r_t); 222 ContractY2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp + 1 * NUM_COMP); 223 } 224 } 225 226 //------------------------------------------------------------------------------ 227 // 2D derivatives transpose 228 //------------------------------------------------------------------------------ 229 template <int NUM_COMP, int P_1D, int Q_1D> 230 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 231 CeedScalar *__restrict__ r_V) { 232 CeedScalar r_t[1]; 233 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 234 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 0 * NUM_COMP, c_B, r_t); 235 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_G, r_V + comp); 236 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp + 1 * NUM_COMP, c_G, r_t); 237 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D>(data, r_t, c_B, r_V + comp); 238 } 239 } 240 241 //------------------------------------------------------------------------------ 242 // 2D quadrature weights 243 //------------------------------------------------------------------------------ 244 template <int Q_1D> 245 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 246 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 247 } 248 249 //------------------------------------------------------------------------------ 250 // 3D 251 //------------------------------------------------------------------------------ 252 253 //------------------------------------------------------------------------------ 254 // 3D tensor contract x 255 //------------------------------------------------------------------------------ 256 template <int NUM_COMP, int P_1D, int Q_1D> 257 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 258 CeedScalar r_B[P_1D]; 259 for (CeedInt i = 0; i < P_1D; i++) { 260 r_B[i] = B[i + data.t_id_x * P_1D]; 261 } 262 263 for (CeedInt k = 0; k < P_1D; k++) { 264 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 265 __syncthreads(); 266 V[k] = 0.0; 267 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 268 for (CeedInt i = 0; i < P_1D; i++) { 269 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 270 } 271 } 272 __syncthreads(); 273 } 274 } 275 276 //------------------------------------------------------------------------------ 277 // 3D tensor contract y 278 //------------------------------------------------------------------------------ 279 template <int NUM_COMP, int P_1D, int Q_1D> 280 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 281 CeedScalar r_B[P_1D]; 282 for (CeedInt i = 0; i < P_1D; i++) { 283 r_B[i] = B[i + data.t_id_y * P_1D]; 284 } 285 286 for (CeedInt k = 0; k < P_1D; k++) { 287 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 288 __syncthreads(); 289 V[k] = 0.0; 290 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 291 for (CeedInt i = 0; i < P_1D; i++) { 292 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 293 } 294 } 295 __syncthreads(); 296 } 297 } 298 299 //------------------------------------------------------------------------------ 300 // 3D tensor contract z 301 //------------------------------------------------------------------------------ 302 template <int NUM_COMP, int P_1D, int Q_1D> 303 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 304 for (CeedInt k = 0; k < Q_1D; k++) { 305 V[k] = 0.0; 306 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 307 for (CeedInt i = 0; i < P_1D; i++) { 308 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction 309 } 310 } 311 } 312 } 313 314 //------------------------------------------------------------------------------ 315 // 3D transpose tensor contract z 316 //------------------------------------------------------------------------------ 317 template <int NUM_COMP, int P_1D, int Q_1D> 318 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 319 for (CeedInt k = 0; k < P_1D; k++) { 320 V[k] = 0.0; 321 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 322 for (CeedInt i = 0; i < Q_1D; i++) { 323 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction 324 } 325 } 326 } 327 } 328 329 //------------------------------------------------------------------------------ 330 // 3D transpose tensor contract y 331 //------------------------------------------------------------------------------ 332 template <int NUM_COMP, int P_1D, int Q_1D> 333 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 334 CeedScalar r_B[Q_1D]; 335 for (CeedInt i = 0; i < Q_1D; i++) { 336 r_B[i] = B[data.t_id_y + i * P_1D]; 337 } 338 339 for (CeedInt k = 0; k < P_1D; k++) { 340 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 341 __syncthreads(); 342 V[k] = 0.0; 343 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 344 for (CeedInt i = 0; i < Q_1D; i++) { 345 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 346 } 347 } 348 __syncthreads(); 349 } 350 } 351 352 //------------------------------------------------------------------------------ 353 // 3D transpose tensor contract y 354 //------------------------------------------------------------------------------ 355 template <int NUM_COMP, int P_1D, int Q_1D> 356 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 357 CeedScalar r_B[Q_1D]; 358 for (CeedInt i = 0; i < Q_1D; i++) { 359 r_B[i] = B[data.t_id_y + i * P_1D]; 360 } 361 362 for (CeedInt k = 0; k < P_1D; k++) { 363 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 364 __syncthreads(); 365 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) { 366 for (CeedInt i = 0; i < Q_1D; i++) { 367 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction 368 } 369 } 370 __syncthreads(); 371 } 372 } 373 374 //------------------------------------------------------------------------------ 375 // 3D transpose tensor contract x 376 //------------------------------------------------------------------------------ 377 template <int NUM_COMP, int P_1D, int Q_1D> 378 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 379 CeedScalar r_B[Q_1D]; 380 for (CeedInt i = 0; i < Q_1D; i++) { 381 r_B[i] = B[data.t_id_x + i * P_1D]; 382 } 383 384 for (CeedInt k = 0; k < P_1D; k++) { 385 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 386 __syncthreads(); 387 V[k] = 0.0; 388 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 389 for (CeedInt i = 0; i < Q_1D; i++) { 390 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 391 } 392 } 393 __syncthreads(); 394 } 395 } 396 397 //------------------------------------------------------------------------------ 398 // 3D transpose tensor contract add x 399 //------------------------------------------------------------------------------ 400 template <int NUM_COMP, int P_1D, int Q_1D> 401 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { 402 CeedScalar r_B[Q_1D]; 403 for (CeedInt i = 0; i < Q_1D; i++) { 404 r_B[i] = B[data.t_id_x + i * P_1D]; 405 } 406 407 for (CeedInt k = 0; k < P_1D; k++) { 408 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k]; 409 __syncthreads(); 410 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 411 for (CeedInt i = 0; i < Q_1D; i++) { 412 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction 413 } 414 } 415 __syncthreads(); 416 } 417 } 418 419 //------------------------------------------------------------------------------ 420 // 3D interpolate to quadrature points 421 //------------------------------------------------------------------------------ 422 template <int NUM_COMP, int P_1D, int Q_1D> 423 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 424 CeedScalar *__restrict__ r_V) { 425 CeedScalar r_t1[T_1D]; 426 CeedScalar r_t2[T_1D]; 427 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 428 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1); 429 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 430 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D); 431 } 432 } 433 434 //------------------------------------------------------------------------------ 435 // 3D interpolate transpose 436 //------------------------------------------------------------------------------ 437 template <int NUM_COMP, int P_1D, int Q_1D> 438 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 439 CeedScalar *__restrict__ r_V) { 440 CeedScalar r_t1[T_1D]; 441 CeedScalar r_t2[T_1D]; 442 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 443 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D, c_B, r_t1); 444 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 445 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D); 446 } 447 } 448 449 //------------------------------------------------------------------------------ 450 // 3D derivatives at quadrature points 451 //------------------------------------------------------------------------------ 452 template <int NUM_COMP, int P_1D, int Q_1D> 453 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 454 CeedScalar *__restrict__ r_V) { 455 CeedScalar r_t1[T_1D]; 456 CeedScalar r_t2[T_1D]; 457 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 458 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_G, r_t1); 459 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 460 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); 461 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1); 462 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 463 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); 464 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1); 465 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 466 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); 467 } 468 } 469 470 //------------------------------------------------------------------------------ 471 // 3D derivatives transpose 472 //------------------------------------------------------------------------------ 473 template <int NUM_COMP, int P_1D, int Q_1D> 474 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 475 CeedScalar *__restrict__ r_V) { 476 CeedScalar r_t1[T_1D]; 477 CeedScalar r_t2[T_1D]; 478 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 479 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_B, r_t1); 480 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 481 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_G, r_V + comp * P_1D); 482 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_B, r_t1); 483 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_G, r_t2); 484 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D); 485 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t1); 486 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 487 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D); 488 } 489 } 490 491 //------------------------------------------------------------------------------ 492 // 3D derivatives at quadrature points 493 //------------------------------------------------------------------------------ 494 template <int NUM_COMP, int P_1D, int Q_1D> 495 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, 496 CeedScalar *__restrict__ r_V) { 497 CeedScalar r_t1[T_1D]; 498 CeedScalar r_t2[T_1D]; 499 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 500 ContractX3d<NUM_COMP, P_1D, Q_1D>(data, r_U + comp * P_1D, c_B, r_t1); 501 ContractY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 502 ContractZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 503 ContractX3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 0 * NUM_COMP * Q_1D); 504 ContractY3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 1 * NUM_COMP * Q_1D); 505 ContractZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_t1, c_G, r_V + comp * Q_1D + 2 * NUM_COMP * Q_1D); 506 } 507 } 508 509 //------------------------------------------------------------------------------ 510 // 3D derivatives transpose 511 //------------------------------------------------------------------------------ 512 template <int NUM_COMP, int P_1D, int Q_1D> 513 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, 514 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { 515 CeedScalar r_t1[T_1D]; 516 CeedScalar r_t2[T_1D]; 517 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 518 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 2 * NUM_COMP * Q_1D, c_G, r_t2); 519 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 1 * NUM_COMP * Q_1D, c_G, r_t2); 520 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D>(data, r_U + comp * Q_1D + 0 * NUM_COMP * Q_1D, c_G, r_t2); 521 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_t1); 522 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D>(data, r_t1, c_B, r_t2); 523 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D>(data, r_t2, c_B, r_V + comp * P_1D); 524 } 525 } 526 527 //------------------------------------------------------------------------------ 528 // 3D quadrature weights 529 //------------------------------------------------------------------------------ 530 template <int Q_1D> 531 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 532 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); 533 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; 534 for (CeedInt q = 0; q < Q_1D; q++) { 535 w[q] = quad ? pw * q_weight_1d[q] : 0.0; 536 } 537 } 538 539 //------------------------------------------------------------------------------ 540 541 #endif 542