// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// Contract the single nodal value per thread against the basis matrix B,
// producing one quadrature value per thread.
// B is indexed as B[p + q * P_1D] (node index fastest).
// data.slice is used as a cross-thread staging buffer; both barriers are
// unconditional so every thread of the block reaches them.
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's nodal value so all quadrature threads can read it.
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < Q_1D) {
    CeedScalar sum = 0.0;

    for (CeedInt p = 0; p < P_1D; p++) {
      sum += B[p + data.t_id_x * P_1D] * data.slice[p];  // Contract x direction
    }
    *V = sum;
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Transpose of ContractX1d: contract the quadrature value per thread against
// B^T, producing one nodal value per thread.
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's quadrature value for the node threads to read.
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < P_1D) {
    CeedScalar sum = 0.0;

    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[data.t_id_x + q * P_1D] * data.slice[q];  // Contract x direction
    }
    *V = sum;
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate nodal values to quadrature points, one x-contraction with c_B
// per component. r_U and r_V hold one scalar per component in registers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of Interp1d: map quadrature values in r_U back to nodal values in
// r_V, one transpose contraction per component.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// In 1D the gradient is a single contraction with the derivative matrix c_G.
// c_B is unused here; it is kept in the signature for symmetry with the
// higher-dimensional Grad* routines.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of Grad1d: contract with c_G^T per component. c_B is unused, as
// in Grad1d.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
// Each of the first Q_1D threads in x loads its quadrature weight; threads
// outside that range write 0.0 so *w is always initialized.
template <int P_1D, int Q_1D>
inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the x direction of a 2D field: each thread holds one value of U
// at (t_id_x, t_id_y); the T_1D-strided slice buffer stages the full tile so
// each output thread can read a whole row. Barriers are unconditional.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  // Output is (Q_1D quadrature points in x) x (P_1D nodes in y).
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the y direction; input has already been contracted in x, so the
// output is Q_1D x Q_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y: Q_1D x Q_1D quadrature data down to
// Q_1D x P_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction in x: Q_1D x P_1D down to P_1D x P_1D nodal values.
// Overwrites *V.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX2d but accumulates into *V instead of
// overwriting it (no *V = 0.0), used to sum derivative contributions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract x then y with c_B per component.
// r_t is a one-element register buffer passing the intermediate between the
// two contractions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// Convenience wrapper using the thread indices stored in data.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor2d_Core: transpose-contract y then x with c_B.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// Convenience wrapper using the thread indices stored in data.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient via tensor product: du/dx = (G in x, B in y) into r_V[.. + 0],
// du/dy = (B in x, G in y) into r_V[.. + 1], components strided by NUM_COMP.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
}

// Convenience wrapper using the thread indices stored in data.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor2d_Core: sum the transposed x- and y-derivative
// contributions into r_V. The second x pass uses the "Add" variant so the two
// contributions accumulate.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

// Convenience wrapper using the thread indices stored in data.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weight at (t_id_x, t_id_y); threads outside the quadrature
// tile write 0.0 so *w is always initialized.
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

// Convenience wrapper using the thread indices stored in data.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract the x direction of a 3D field stored as a z-vector of 2D slabs:
// each thread holds P_1D values U[k] (one per z layer). The relevant B row is
// cached in registers (r_B) once and reused for all layers; each layer is
// staged through data.slice with unconditional barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_x * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract the y direction, layer by layer, with the B row for this thread's
// y index cached in registers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_y * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract the z direction. The z data lives entirely in this thread's
// registers (U[0..P_1D)), so no shared memory or barriers are needed.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose contraction in z; register-only like ContractZ3d.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y, layer by layer; overwrites V[k].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V[k] instead of
// overwriting it (no V[k] = 0.0), used to sum derivative contributions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction in x, layer by layer; overwrites V[k].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3d but accumulates into V[k] instead of
// overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Tensor-product interpolation: contract x, y, then z with c_B per component.
// r_t1/r_t2 are register buffers passing intermediates between contractions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor3d: transpose-contract z, y, then x with c_B.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient via tensor product: for each component, du/dx = (G, B, B),
// du/dy = (B, G, B), du/dz = (B, B, G), the three outputs strided by
// NUM_COMP * Q_1D in r_V.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[T_1D];
  CeedScalar r_t2[T_1D];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // du/dx: G in x, B in y and z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
    // du/dy: B in x, G in y, B in z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
    // du/dz: B in x and y, G in z.
    // r_t1 still holds the x-contraction of r_U with c_B from the du/dy pass
    // (ContractY3d/ContractZ3d only read their input), so the redundant
    // ContractX3d(..., c_B, r_t1) is skipped and r_t1 is reused directly,
    // saving P_1D shared-memory round trips and 2 * P_1D block barriers.
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor3d: the transposed x-, y-, and z-derivative
// contributions are summed into r_V[comp * P_1D] per component. The first
// x pass overwrites the output; the remaining two use the "Add" variants so
// the contributions accumulate in that fixed order.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_a[T_1D];
  CeedScalar r_tmp_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    CeedScalar *r_out = &r_V[c * P_1D];

    // x-derivative contribution: B^T in z and y, G^T in x (overwrites r_out)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_tmp_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_a, c_B, r_tmp_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_G, r_out);
    // y-derivative contribution: B^T in z, G^T in y, B^T in x (accumulates)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_tmp_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_a, c_G, r_tmp_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_B, r_out);
    // z-derivative contribution: G^T in z, B^T in y and x (accumulates)
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_a, c_B, r_tmp_b);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_B, r_out);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated gradient)
//------------------------------------------------------------------------------
// Collocated gradient: first interpolate the nodal values to the quadrature
// grid with c_B, then apply the Q_1D x Q_1D derivative matrix c_G in each
// direction independently on the quadrature-point values.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_a[T_1D];
  CeedScalar r_tmp_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Interpolate to the quadrature grid; result lands in r_tmp_b.
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_tmp_b);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_B, r_tmp_a);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_a, c_B, r_tmp_b);
    // Differentiate the collocated values in each direction.
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_b, c_G, &r_V[c * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_b, c_G, &r_V[c * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_b, c_G, &r_V[c * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose (collocated gradient)
//------------------------------------------------------------------------------
// Transpose of GradTensorCollocated3d: accumulate the transposed derivative
// contributions on the quadrature grid (Z overwrites, AddY/AddX accumulate —
// order is fixed), then map back to nodes with B^T in z, y, x.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_a[T_1D];
  CeedScalar r_tmp_b[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Sum d/dz, d/dy, d/dx transposed contributions at quadrature points.
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp_b);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_tmp_b);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_tmp_b);
    // Transpose interpolation back to nodal values.
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_B, r_tmp_a);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_a, c_B, r_tmp_b);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_b, c_B, &r_V[c * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product weights: each thread inside the quadrature tile fills its
// z-column of weights w[q] = w_x * w_y * w_z; threads outside the tile write
// zeros so w is always initialized.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedScalar w_xy = q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y];

    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = w_xy * q_weight_1d[q];
    }
  } else {
    for (CeedInt q = 0; q < Q_1D; q++) {
      w[q] = 0.0;
    }
  }
}