// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value in shared memory so all threads can read every dof
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < Q_1D) {
    // Contract x direction: V[q] = sum_p B[p + q*P_1D] * U[p]
    for (CeedInt p = 0; p < P_1D; p++) *V += B[p + data.t_id_x * P_1D] * data.slice[p];
  }
  __syncthreads();  // guard data.slice before the next contraction overwrites it
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void ContractTransposeX1d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Stage this thread's input value in shared memory so all threads can read every quadrature value
  data.slice[data.t_id_x] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.t_id_x < P_1D) {
    // Contract x direction (transpose): V[p] = sum_q B[p + q*P_1D] * U[q]
    for (CeedInt q = 0; q < Q_1D; q++) *V += B[data.t_id_x + q * P_1D] * data.slice[q];
  }
  __syncthreads();  // guard data.slice before the next contraction overwrites it
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  // (A stray git merge-conflict marker ">>>>>>> b855402d ..." was removed here)
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                               CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  // Note: V is intentionally NOT zeroed -- this variant accumulates into V
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Core takes explicit thread coordinates so the same logic serves both the
// native 2D thread block layout and the flattened 1D layout below.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                           const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Fix: use the MAX of P_1D/Q_1D (was min, contradicting the name) so the
  // flattened 1D thread index decomposes onto the larger of the dof/quad grids;
  // it also supplies the previously-missing T_1D template argument to the core.
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, max_1d>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                    const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Fix: max (was min) + pass max_1d as the missing T_1D template argument
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, max_1d>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // d/dx: G in x, B in y
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    // d/dy: B in x, G in y
    ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Fix: max (was min) + pass max_1d as the missing T_1D template argument
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, max_1d>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ r_U,
                                                  const CeedScalar *c_B, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    // Fix: the add-variant call was missing its T_1D template argument
    ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
}

template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Fix: call the core by its declared name (was misspelled "GradTanspose...")
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x, data.t_id_y, r_U, c_B, c_G, r_V);
}

template <int NUM_COMP, int P_1D, int Q_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Fix: max (was min), corrected core name spelling, and pass max_1d as T_1D
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, max_1d>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void WeightTensor2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *__restrict__ q_weight_1d,
                                           CeedScalar *w) {
  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  WeightTensor2d_Core<Q_1D>(data, data.t_id_x, data.t_id_y, q_weight_1d, w);
}

template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Fix: max (was min) so the flat index decomposes onto the larger grid
  const int max_1d = P_1D > Q_1D ? P_1D : Q_1D;

  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's row of B in registers; reused across all z-slabs
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_x * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P_1D];
  for (CeedInt i = 0; i < P_1D; i++) {
    r_B[i] = B[i + data.t_id_y * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// The z direction is held in registers, so no shared memory or barrier is needed
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < P_1D; i++) {
        V[k] += B[i + k * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < P_1D; k++) {
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += B[k + i * P_1D] * U[i];  // Contract z direction
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_y + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    // Note: V is intentionally NOT zeroed -- this variant accumulates into V
    if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[Q_1D];
  for (CeedInt i = 0; i < Q_1D; i++) {
    r_B[i] = B[data.t_id_x + i * P_1D];
  }

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction
      }
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3d(SharedData_Hip &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Cache this thread's column of B in registers; reused across all z-slabs
  CeedScalar r_B[Q_1D];
  for (CeedInt q = 0; q < Q_1D; q++) r_B[q] = B[data.t_id_x + q * P_1D];

  for (CeedInt k = 0; k < P_1D; k++) {
    data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
    __syncthreads();
    // Accumulating variant: V is not zeroed
    if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
      for (CeedInt q = 0; q < Q_1D; q++) V[k] += r_B[q] * data.slice[q + data.t_id_y * T_1D];  // Contract x direction
    }
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediate x- and y-contracted values
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_tmp_1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Scratch registers for the intermediate z- and y-contracted values
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_tmp_1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                    CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // d/dx: apply G in x, B in y and z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_tmp_1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
    // d/dy: apply G in y, B in x and z
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_tmp_1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_G, r_tmp_2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
    // d/dz: apply G in z, B in x and y
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_tmp_1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // x-derivative block writes r_V; the y and z blocks accumulate into it
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_tmp_1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_G, &r_V[comp * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_tmp_1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_G, r_tmp_2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * P_1D]);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp_1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated derivative operator)
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Interpolate to quadrature points first, then differentiate at Q_1D points
    ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_tmp_1);
    ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, r_tmp_1);
    ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
    ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
    ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_tmp_1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose (collocated derivative operator)
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  CeedScalar r_tmp_1[T_1D];
  CeedScalar r_tmp_2[T_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Accumulate the transposed collocated derivatives, then interpolate back to dofs
    ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_tmp_2);
    ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_tmp_2);
    ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_tmp_2);
    ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, r_tmp_1);
    ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_1, c_B, r_tmp_2);
    ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_tmp_2, c_B, &r_V[comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
template <int Q_1D>
inline __device__ void WeightTensor3d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Threads outside the quadrature grid contribute zero weights
  const bool       in_bounds = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
  const CeedScalar w_xy      = in_bounds ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;

  for (CeedInt q = 0; q < Q_1D; q++) {
    w[q] = in_bounds ? w_xy * q_weight_1d[q] : 0.0;
  }
}