// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract with B along the x (fastest) dimension: the guard shows a
// (P_1D, P_1D) input producing a (Q_1D, P_1D) result in *V.
// The thread block is flattened to 1D; (t_id_x, t_id_y) index a T_1D-strided
// tile of shared memory (data.slice) used to exchange values between threads.
// NOTE: contains __syncthreads(), so every thread in the block must call this.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;  // stage this thread's value in shared memory
  __syncthreads();                          // all values staged before any thread reads neighbors
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();  // keep slice intact until every thread has finished reading
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract with B along the y dimension: (Q_1D, P_1D) input -> (Q_1D, Q_1D).
// Same shared-memory staging and barrier requirements as ContractX2dFlattened.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Apply B transposed along y: (Q_1D, Q_1D) input -> (Q_1D, P_1D) output.
// Values are exchanged through data.slice (T_1D-strided shared memory tile);
// contains __syncthreads(), so every thread in the block must call this.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Apply B transposed along x: (Q_1D, P_1D) input -> (P_1D, P_1D) output.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// As ContractTransposeX2dFlattened, but accumulates into *V instead of zeroing
// it first (used to sum the x and y derivative contributions in the gradient
// transpose).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Remap each component's values from the T_1D-strided (t_id_x, t_id_y) layout
// to a contiguous layout where flattened thread k holds point
// (k % Q_1D, k / Q_1D); threads at or beyond the Q_1D^2 points receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}

// Inverse of QPack2d: spread each component's contiguously packed values back
// to the T_1D-strided (t_id_x, t_id_y) layout; out-of-range threads get 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate NUM_COMP components from nodes to quadrature points by
// contracting with c_B in x, then in y (note: QUnpack2d is instantiated with
// P_1D here since the input is in the nodal layout).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register for the intermediate contraction

  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor2dFlattened: map quadrature values back to nodes by
// contracting with c_B transposed in y, then in x.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register for the intermediate contraction

  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Evaluate gradients at quadrature points: for each component, d/dx is written
// to r_V[comp + 0 * NUM_COMP] and d/dy to r_V[comp + 1 * NUM_COMP] (c_G applied
// in the derivative direction, c_B in the other).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register for the intermediate contraction

  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor2dFlattened: both directional contributions are summed
// into r_V[comp] (the second via the accumulate variant).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch register for the intermediate contraction

  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product quadrature weight at this thread's point; threads outside the
// Q_1D^2 grid receive 0.0.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract with B along x: (P_1D, P_1D, P_1D) input -> (Q_1D, P_1D, P_1D).
// (t_id_x, t_id_y, t_id_z) index a T_1D^2-strided shared memory tile;
// contains __syncthreads(), so every thread in the block must call this.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract with B along y: (Q_1D, P_1D, P_1D) input -> (Q_1D, Q_1D, P_1D).
// Values are exchanged through data.slice (T_1D^2-strided shared memory tile);
// contains __syncthreads(), so every thread in the block must call this.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract with B along z: (Q_1D, Q_1D, P_1D) input -> (Q_1D, Q_1D, Q_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Apply B transposed along z: (Q_1D, Q_1D, Q_1D) input -> (Q_1D, Q_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
//------------------------------------------------------------------------------
// As ContractTransposeZ3dFlattened, but accumulates into *V instead of
// zeroing it first.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Apply B transposed along y: (Q_1D, Q_1D, P_1D) input -> (Q_1D, P_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
//------------------------------------------------------------------------------
// As ContractTransposeY3dFlattened, but accumulates into *V instead of
// zeroing it first.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Apply B transposed along x: (Q_1D, P_1D, P_1D) input -> (P_1D, P_1D, P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                     const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add x
//------------------------------------------------------------------------------
// As ContractTransposeX3dFlattened, but accumulates into *V instead of
// zeroing it first (used to sum directional derivative contributions).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Remap each component's values from the T_1D-strided (t_id_x, t_id_y, t_id_z)
// layout to a contiguous layout where flattened thread k holds point
// (k % Q_1D, (k % Q_1D^2) / Q_1D, k / Q_1D^2); threads at or beyond the Q_1D^3
// points receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Guard all three indices so only valid values are staged (cf. QPack2d)
    if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // All Q_1D^3 packed points participate, not just the first Q_1D^2 plane
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}

// Inverse of QPack3d: spread each component's contiguously packed values back
// to the T_1D-strided (t_id_x, t_id_y, t_id_z) layout; out-of-range threads
// receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // All Q_1D^3 packed values must be staged, not just the first Q_1D^2
    if (data.t_id_x < (Q_1D * Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // Guard z as well so no thread reads an unwritten slice plane
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate NUM_COMP components from nodes to quadrature points by
// contracting with c_B in x, then y, then z.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor3dFlattened: map quadrature values back to nodes by
// contracting with c_B transposed in z, then y, then x.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Evaluate gradients at quadrature points: for each component, d/dx, d/dy, d/dz
// are written to r_V[comp + 0/1/2 * NUM_COMP] (c_G applied in the derivative
// direction, c_B in the others).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor3dFlattened: the three directional contributions are
// summed into r_V[comp] (the latter two via the accumulate variants).
// Sizes (NUM_COMP, P_1D, Q_1D, T_1D) are compile-time template arguments; the
// runtime thread indices are passed as function arguments.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated gradient)
//------------------------------------------------------------------------------
// Interpolate to quadrature points with c_B, then apply the collocated
// derivative matrix c_G in each direction (note Q_1D is used for both size
// template arguments of the derivative contractions).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose (collocated gradient)
//------------------------------------------------------------------------------
// Transpose of GradTensorCollocated3dFlattened: accumulate the transposed
// collocated derivatives of the three input fields into r_t2, then apply the
// transposed interpolation c_B in z, y, x back to the nodes.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch registers for intermediate contractions

  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor-product quadrature weight at this thread's point; threads outside the
// Q_1D^3 grid receive 0.0.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);

  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
}