// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// Apply the P_1D x Q_1D interpolation matrix B to one per-thread value.
// The inputs are staged through data.slice (shared memory) so each of the first
// Q_1D threads can read all P_1D nodal values; every thread writes its output
// (threads with t_id_x >= Q_1D write 0).
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.t_id_x] = *U;
  __syncthreads();

  CeedScalar sum = 0.0;

  if (data.t_id_x < Q_1D) {
    for (CeedInt p = 0; p < P_1D; p++) sum += B[p + data.t_id_x * P_1D] * data.slice[p];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
// Apply B^T: each of the first P_1D threads accumulates over the Q_1D
// quadrature-point values staged in shared memory; other threads write 0.
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.t_id_x] = *U;
  __syncthreads();

  CeedScalar sum = 0.0;

  if (data.t_id_x < P_1D) {
    for (CeedInt q = 0; q < Q_1D; q++) sum += B[data.t_id_x + q * P_1D] * data.slice[q];  // Contract x direction
  }
  *V = sum;
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate a NUM_COMP-component field from P_1D nodes to Q_1D quadrature
// points; components are stored with unit stride in the register arrays.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// c_B is unused in 1D (the derivative is a single contraction with c_G); the
// parameter is kept for interface consistency with the higher-dimension versions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// NOTE(review): data.slice is indexed as a T_1D-strided row-major scratch pad,
// so the caller must supply at least T_1D * T_1D scalars of shared memory.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX2d but accumulates into V instead of overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Decompose the flat thread index with the LARGER of P_1D/Q_1D so the derived
  // (x, y) pair covers every node and quadrature point the core requires.
  // (Previously `P_1D < Q_1D ? ...` selected the minimum, leaving coordinates
  // >= min(P_1D, Q_1D) unreachable whenever P_1D != Q_1D.)
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Use the larger extent for the index decomposition (see InterpTensor2dFlattened).
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Output layout: r_V[comp + dim * NUM_COMP], dim 0 = d/dx, dim 1 = d/dy.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Use the larger extent for the index decomposition (see InterpTensor2dFlattened).
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Use the larger extent for the index decomposition (see InterpTensor2dFlattened).
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
}

template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Use the larger extent for the index decomposition (see InterpTensor2dFlattened).
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// U and V are per-thread register columns (one value per z-layer); each layer is
// staged through shared memory, so U/V hold P_1D values here.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  // Cache this thread's row of B in registers (reused for every z-layer)
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_x * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_y * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// The z direction lives entirely in per-thread registers, so no shared memory
// staging or synchronization is needed.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3d but accumulates into V instead of overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Like ContractTransposeX3d, but sums the contraction into V rather than
// overwriting it.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_basis[Q_1D];

  // Cache this thread's row of B^T in registers, reused across z-layers
  for (CeedInt q = 0; q < Q_1D; q++) r_basis[q] = B[data.t_id_x + q * P_1D];

  for (CeedInt layer = 0; layer < P_1D; layer++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[layer];
    __syncthreads();
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt q = 0; q < Q_1D; q++) {
        V[layer] += r_basis[q] * data.slice[q + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolation as three successive 1D contractions (x, then y, then z).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_tmp1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose interpolation: contractions applied in the reverse order (z, y, x).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D], c_B, r_tmp1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Output layout: r_V[c * Q_1D + dim * NUM_COMP * Q_1D], dim 0/1/2 = d/dx, d/dy, d/dz.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // d/dx: differentiate in x, interpolate in y and z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_G, r_tmp1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * Q_1D + 0 * NUM_COMP * Q_1D]);
    // d/dy: differentiate in y, interpolate in x and z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_tmp1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_G, r_tmp2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * Q_1D + 1 * NUM_COMP * Q_1D]);
    // d/dz: differentiate in z, interpolate in x and y
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_tmp1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_G, &r_V[c * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Accumulates the transpose action of all three derivative directions into r_V.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // d/dx^T contribution initializes r_V, the other two add into it
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_tmp1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_G, &r_V[c * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_tmp1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_G, r_tmp2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Collocated variant: interpolate to quadrature points once with c_B, then
// apply the Q_1D x Q_1D collocated derivative matrix c_G per direction.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[c * P_1D], c_B, r_tmp1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, r_tmp1);
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp1, c_G, &r_V[c * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp1, c_G, &r_V[c * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp1, c_G, &r_V[c * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Collocated transpose: accumulate all three directional G^T contributions at
// the quadrature points, then apply the transpose interpolation back to nodes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp1[T_1D];
  CeedScalar r_tmp2[T_1D];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp2);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_tmp2);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[c * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_tmp2);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, r_tmp1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp1, c_B, r_tmp2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp2, c_B, &r_V[c * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const bool in_bounds = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  if (in_bounds) {
    const CeedScalar wt_xy = q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y];

    for (CeedInt z = 0; z < Q_1D; z++) w[z] = wt_xy * q_weight_1d[z];
  } else {
    for (CeedInt z = 0; z < Q_1D; z++) w[z] = 0.0;
  }
}