// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the 1D basis matrix B against the x index of the per-thread value U:
//   V = sum_i B[i + t_id_x * P_1D] * U(i, t_id_y)
// data.slice is used as a staging buffer and must hold at least T_1D * T_1D
// scalars.  Both __syncthreads() are outside the divergent guard, so every
// thread in the block reaches them.  Threads outside the (Q_1D, P_1D) output
// box produce 0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;  // stage this thread's value
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the 1D basis matrix B against the y index of the per-thread value U:
//   V = sum_i B[i + t_id_y * P_1D] * U(t_id_x, i)
// Same staging/barrier pattern as ContractX2dFlattened; output box is
// (Q_1D, Q_1D), i.e. x has already been contracted to quadrature points.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y (quadrature -> nodes):
//   V = sum_i B[t_id_y + i * P_1D] * U(t_id_x, i)
// Output box is (Q_1D, P_1D): y is back at node points, x still at quadrature.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction in x (quadrature -> nodes):
//   V = sum_i B[t_id_x + i * P_1D] * U(i, t_id_y)
// Output box is (P_1D, P_1D): both directions back at node points.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX2dFlattened but accumulates into *V instead of
// overwriting it (no `*V = 0.0`), used to sum gradient contributions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack per-thread values from the padded (T_1D, T_1D) thread layout into the
// compact flattened layout where the first Q_1D * Q_1D threads hold the data.
// One shared-memory round trip per component.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
  }
}

// Inverse of QPack2d: scatter values from the compact Q_1D * Q_1D layout back
// to the padded (T_1D, T_1D) layout; threads outside the box receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate nodal values r_U (NUM_COMP components) to quadrature values r_V
// via successive x then y contractions with c_B.  Inputs/outputs are re-packed
// only when the node/quad grid is smaller than the padded thread grid T_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // temporary between the x and y contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);  // restore r_U's packed layout
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor2dFlattened: map quadrature values r_U back to
// nodal values r_V via transpose y then transpose x contractions with c_B.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Collocated nodes/quadrature: interpolation is the identity, so just copy
// components, with the usual pack/unpack bookkeeping for padded thread grids.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                              CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose of the collocated identity interpolation: copy components back.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                       CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient at quadrature points: for each component, r_V holds d/dx in slot
// [comp + 0 * NUM_COMP] (c_G in x, c_B in y) and d/dy in [comp + 1 * NUM_COMP]
// (c_B in x, c_G in y).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);  // r_V has 2 * NUM_COMP entries
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: accumulate both derivative contributions of r_U
// (2 * NUM_COMP entries) into the nodal values r_V (NUM_COMP entries).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);  // add d/dy part
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Collocated gradient: nodes coincide with quadrature points, so each
// derivative is a single 1D contraction with c_G (c_B is unused).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose collocated gradient: accumulate transpose contractions of both
// derivative slots with c_G into r_V (c_B is unused).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product quadrature weight for this thread's point:
//   w = q_weight_1d[t_id_x] * q_weight_1d[t_id_y], 0 for threads past the grid.
// Note t_id_x = data.t_id_x % Q_1D is always < Q_1D; the guard protects t_id_y.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract B against the x index of U in 3D; data.slice must hold at least
// T_1D^3 scalars.  Output box is (Q_1D, P_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract B against the y index of U in 3D; output box is (Q_1D, Q_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract B against the z index of U in 3D; output box is (Q_1D, Q_1D, Q_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Transpose contraction in z (quadrature -> nodes); output box is
// (Q_1D, Q_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
//------------------------------------------------------------------------------
// Same as ContractTransposeZ3dFlattened but accumulates into *V (no zeroing).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Transpose contraction in y; output box is (Q_1D, P_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3dFlattened but accumulates into *V (no zeroing).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Transpose contraction in x; output box is (P_1D, P_1D, P_1D) — all
// directions back at node points.  data.slice must hold at least T_1D^3
// scalars; both barriers are reached by all threads.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3dFlattened but accumulates into *V (no zeroing).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack per-thread values from the padded (T_1D, T_1D, T_1D) thread layout
// into the compact flattened layout where the first Q_1D^3 threads hold the
// data.  One shared-memory round trip per component.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Guard includes t_id_z so z-layers >= Q_1D do not write slots that the
    // reads below (new_t_id_z < Q_1D) never consume
    if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
  }
}

// Inverse of QPack3d: scatter values from the compact Q_1D^3 layout back to
// the padded (T_1D, T_1D, T_1D) layout; threads outside the box receive 0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // Guard includes t_id_z: z-layers >= Q_1D were not written above and would
    // otherwise read stale shared-memory values; those threads get 0 instead
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate nodal values r_U (NUM_COMP components) to quadrature values r_V
// via successive x, y, z contractions with c_B.  Inputs/outputs are re-packed
// only when the node/quad grid is smaller than the padded thread grid T_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // temporaries between contraction stages

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);  // restore r_U's packed layout
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor3dFlattened: map quadrature values r_U back to
// nodal values r_V via transpose z, y, x contractions with c_B.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Collocated nodes/quadrature: interpolation is the identity, so just copy
// components, with the usual pack/unpack bookkeeping for padded thread grids.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                              CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];  // identity interpolation
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Transpose of the collocated identity interpolation: copy components back.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                       CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_V[comp] = r_U[comp];
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Gradient at quadrature points: for each component, slot [comp + d * NUM_COMP]
// of r_V holds the d-th derivative, computed by applying c_G in direction d and
// c_B in the other two directions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);  // r_V has 3 * NUM_COMP entries
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose gradient: accumulate the three derivative contributions of r_U
// (3 * NUM_COMP entries) into the nodal values r_V (NUM_COMP entries).
// Fixed: the contraction calls previously passed the runtime thread indices
// t_id_x/t_id_y/t_id_z as template arguments and omitted them from the call
// argument lists; they are runtime arguments, as in every sibling routine.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);  // add d/dy part
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);  // add d/dz part
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points, collocated gradient
//------------------------------------------------------------------------------
// Interpolate r_U to quadrature points with c_B, then differentiate the
// interpolated values in each direction with the collocated gradient c_G.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);  // r_t1 = interpolated value
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose, collocated gradient
//------------------------------------------------------------------------------
// Transpose of GradTensorCollocated3dFlattened: accumulate the transpose
// collocated derivatives into r_t2, then apply the transpose interpolation.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points, nodes and quadrature points collocated
//------------------------------------------------------------------------------
// Collocated nodes/quadrature: each derivative is a single 1D contraction with
// c_G (c_B is unused).
// Fixed: the contraction calls previously passed r_U[comp] (a value) where the
// helpers take `const CeedScalar *U`; the address &r_U[comp] is required, as in
// the 2D analog.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                            const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose, nodes and quadrature points collocated
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                     const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G,
&r_V[comp]); 662 ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]); 663 ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]); 664 } 665 __syncthreads(); 666 if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V); 667 } 668 669 //------------------------------------------------------------------------------ 670 // 3D quadrature weights 671 //------------------------------------------------------------------------------ 672 template <int P_1D, int Q_1D> 673 inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { 674 const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); 675 676 *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; 677 } 678