// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract along the x direction with basis matrix B.
// Each thread stages its single value *U into data.slice so the whole x-line is
// visible to the block; threads with t_id_x < Q_1D and t_id_y < P_1D then
// accumulate their output value into *V. The trailing barrier makes data.slice
// immediately reusable by the caller.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    CeedScalar sum = 0.0;

    // Contract x direction
    for (CeedInt k = 0; k < P_1D; k++) sum += data.slice[k + t_id_y * T_1D] * B[k + t_id_x * P_1D];
    *V = sum;
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract along the y direction with basis matrix B; threads with
// t_id_x < Q_1D and t_id_y < Q_1D produce the output values.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    CeedScalar sum = 0.0;

    // Contract y direction
    for (CeedInt k = 0; k < P_1D; k++) sum += data.slice[t_id_x + k * T_1D] * B[k + t_id_y * P_1D];
    *V = sum;
  }
  __syncthreads();
}
//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract along y with B transposed; output region is t_id_x < Q_1D,
// t_id_y < P_1D. data.slice is used as staging, bracketed by barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract along x with B transposed; output region is t_id_x < P_1D,
// t_id_y < P_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX2dFlattened, but accumulates into *V instead of
// overwriting it (no zero-initialization of *V).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack per-thread values from the (t_id_x, t_id_y) layout with row stride
// T_1D into a flat Q_1D * Q_1D ordering indexed by data.t_id_x; threads beyond
// Q_1D * Q_1D receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}

// Inverse of QPack2d: scatter flat Q_1D * Q_1D values back to the
// (t_id_x, t_id_y) layout with row stride T_1D; out-of-range threads get 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply B in x then y for each component. When the packed layouts differ from
// the T_1D thread layout, inputs are unpacked first and both r_U and r_V are
// repacked afterwards.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch between the x and y contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Apply B transposed in y then x for each component (adjoint of
// InterpTensor2dFlattened).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch between the y and x contractions

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// For each component, compute the x-derivative (G in x, B in y) into
// r_V[comp + 0 * NUM_COMP] and the y-derivative (B in x, G in y) into
// r_V[comp + 1 * NUM_COMP].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch between contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);  // both derivative fields packed
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Adjoint of GradTensor2dFlattened: the two derivative fields of r_U are
// contracted back and summed into r_V[comp] (second pass uses the Add variant).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // scratch between contractions

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Tensor product of the 1D quadrature weights; threads outside the
// Q_1D x Q_1D grid receive 0.0.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Contract along x with B; output region is t_id_x < Q_1D, t_id_y < P_1D,
// t_id_z < P_1D. data.slice holds a T_1D^3 staging brick.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// Contract along y with B; output region is t_id_x < Q_1D, t_id_y < Q_1D,
// t_id_z < P_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Contract along z with B; output region is Q_1D in all three directions.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Contract along z with B transposed; output region is t_id_x < Q_1D,
// t_id_y < Q_1D, t_id_z < P_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
//------------------------------------------------------------------------------
// Same as ContractTransposeZ3dFlattened, but accumulates into *V instead of
// overwriting it (no zero-initialization of *V).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract along y with B transposed; output region is t_id_x < Q_1D,
// t_id_y < P_1D, t_id_z < P_1D.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
//------------------------------------------------------------------------------
// Same as ContractTransposeY3dFlattened, but accumulates into *V instead of
// overwriting it (no zero-initialization of *V).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract along x with B transposed; output region is P_1D in all three
// directions. data.slice holds a T_1D^3 staging brick, bracketed by barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractTransposeX3dFlattened, but accumulates into *V instead of
// overwriting it (no zero-initialization of *V).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// 3D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack per-thread values from the (t_id_x, t_id_y, t_id_z) layout with
// strides T_1D / T_1D^2 into a flat Q_1D^3 ordering indexed by data.t_id_x;
// threads beyond Q_1D^3 receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}

// Inverse of QPack3d: scatter flat Q_1D^3 values back to the
// (t_id_x, t_id_y, t_id_z) layout; out-of-range threads get 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Apply B in x, y, then z for each component. When packed layouts differ from
// the T_1D thread layout, inputs are unpacked first and both r_U and r_V are
// repacked afterwards.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Apply B transposed in z, y, then x for each component (adjoint of
// InterpTensor3dFlattened).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// For each component, compute the three directional derivatives: G in exactly
// one direction and B in the other two, stored at r_V[comp + d * NUM_COMP]
// for d = 0 (x), 1 (y), 2 (z).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Adjoint of GradTensor3dFlattened: the three derivative fields of r_U are
// contracted back and summed into r_V[comp] (the second and third passes use
// the Add variant of the x contraction).
//
// Fixed: the contraction calls previously passed the runtime thread indices
// (t_id_x, t_id_y, t_id_z) as template arguments; the contraction helpers take
// exactly <NUM_COMP, P_1D, Q_1D, T_1D> as template parameters and the thread
// indices as the first runtime arguments (matching GradTransposeTensor2dFlattened
// and every other 3D caller in this file), so the previous form could not compile.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // x-derivative field: B^T in z and y, G^T in x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // y-derivative field: B^T in z, G^T in y, B^T in x (accumulated)
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // z-derivative field: G^T in z, B^T in y and x (accumulated)
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D collocated derivatives at quadrature points
//------------------------------------------------------------------------------
// Interpolate with B in x, y, z, then apply the collocated gradient matrix c_G
// (Q_1D x Q_1D) independently in each direction to produce the three
// derivative fields at r_V[comp + d * NUM_COMP].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    // Collocated derivatives: P_1D == Q_1D for these contractions
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Adjoint of GradTensorCollocated3dFlattened: sum the transposed collocated
// gradient over the three derivative fields into r_t2, then apply B transposed
// in z, y, x.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // scratch between successive contractions

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Tensor product of the 1D quadrature weights; threads outside the Q_1D^3
// grid receive 0.0.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);

  *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
}