// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Contract the x index of U with the 1D basis matrix B:
//   V(q, y) = sum_i B[i + q * P_1D] * U(i, y)
// Each thread holds one value of U/V; values are exchanged through data.slice,
// indexed t_id_x + t_id_y * T_1D. Threads with t_id_x >= Q_1D or
// t_id_y >= P_1D produce *V = 0.0.
// NOTE: contains unconditional __syncthreads() — every thread of the block
// must call this, outside of divergent control flow.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;  // Stage this thread's input value in shared memory
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
// Contract the y index of U with the 1D basis matrix B:
//   V(x, q) = sum_i B[i + q * P_1D] * U(x, i)
// Threads with t_id_x >= Q_1D or t_id_y >= Q_1D produce *V = 0.0.
// NOTE: unconditional __syncthreads() — see ContractX2dFlattened.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
// Contract the y index of U with B transposed (quadrature -> node direction):
//   V(x, p) = sum_i B[p + i * P_1D] * U(x, i), summing over the Q_1D index i
// Threads with t_id_x >= Q_1D or t_id_y >= P_1D produce *V = 0.0.
// NOTE: unconditional __syncthreads() — every thread of the block must call
// this, outside of divergent control flow.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract the x index of U with B transposed:
//   V(p, y) = sum_i B[p + i * P_1D] * U(i, y), summing over the Q_1D index i
// Threads with t_id_x >= P_1D or t_id_y >= P_1D produce *V = 0.0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same contraction as ContractTransposeX2dFlattened but accumulates into *V
// instead of overwriting it (no *V = 0.0), so results from multiple
// contractions can be summed in place.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 2D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack the NUM_COMP per-thread register values from the 2D thread layout
// (t_id_x, t_id_y) over T_1D x T_1D into the contiguous layout where the
// first Q_1D * Q_1D threads (indexed by data.t_id_x) each hold one value.
// Threads past Q_1D * Q_1D receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
  }
}

// Inverse of QPack2d: scatter the contiguous Q_1D * Q_1D values (one per
// thread index data.t_id_x) back to the 2D (t_id_x, t_id_y) thread layout.
// Threads outside the Q_1D x Q_1D range receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
    __syncthreads();
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate NUM_COMP components from P_1D^2 nodes (r_U) to Q_1D^2
// quadrature points (r_V) by contracting x then y with c_B.
// r_U/r_V are unpacked/packed via QUnpack2d/QPack2d when the respective
// extent differs from the T_1D thread-layout extent.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];  // Single-value temporary between the two contractions

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);  // Restore input packing
  if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor2dFlattened: map NUM_COMP components from Q_1D^2
// quadrature points (r_U) back to P_1D^2 nodes (r_V), contracting y then x
// with c_B transposed.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// Evaluate the gradient at quadrature points: for each component, contract
// with c_G in one direction and c_B in the other.
// Output layout: r_V[comp + 0 * NUM_COMP] = d/dx, r_V[comp + 1 * NUM_COMP] = d/dy.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);  // Two derivative fields per component
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor2dFlattened: accumulate the d/dx and d/dy
// contributions of r_U (layout as produced by GradTensor2dFlattened) into the
// nodal values r_V, using the add variant for the second contribution.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
  CeedScalar r_t[1];

  if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
}

//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// Each of the first Q_1D * Q_1D threads receives the tensor-product weight
// q_weight_1d[x] * q_weight_1d[y]; threads past that range receive 0.0.
// (t_id_x < Q_1D holds by construction; only the t_id_y bound can fail.)
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;

  *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// 3D analogue of ContractX2dFlattened; the shared slice is indexed
// t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D.
//   V(q, y, z) = sum_i B[i + q * P_1D] * U(i, y, z)
// Threads outside the Q_1D x P_1D x P_1D output range produce *V = 0.0.
// NOTE: unconditional __syncthreads() — see ContractX2dFlattened.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
// V(x, q, z) = sum_i B[i + q * P_1D] * U(x, i, z)
// Threads outside the Q_1D x Q_1D x P_1D output range produce *V = 0.0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// V(x, y, q) = sum_i B[i + q * P_1D] * U(x, y, i)
// Threads outside the Q_1D^3 output range produce *V = 0.0.
// NOTE: unconditional __syncthreads() — every thread of the block must call
// this, outside of divergent control flow (applies to all contractions below).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
    for (CeedInt i = 0; i < P_1D; i++) {
      *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
// Contract the z index of U with B transposed:
//   V(x, y, p) = sum_i B[p + i * P_1D] * U(x, y, i), summing over the Q_1D index i
// Threads outside the Q_1D x Q_1D x P_1D output range produce *V = 0.0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add z
//------------------------------------------------------------------------------
// Same contraction as ContractTransposeZ3dFlattened but accumulates into *V
// instead of overwriting it (no *V = 0.0).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];  // Contract z direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
// V(x, p, z) = sum_i B[p + i * P_1D] * U(x, i, z), summing over the Q_1D index i
// Threads outside the Q_1D x P_1D x P_1D output range produce *V = 0.0.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
// Same contraction as ContractTransposeY3dFlattened but accumulates into *V
// instead of overwriting it (no *V = 0.0).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];  // Contract y direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
// Contract the x index of U with B transposed:
//   V(p, y, z) = sum_i B[p + i * P_1D] * U(i, y, z), summing over the Q_1D index i
// Threads outside the P_1D^3 output range produce *V = 0.0.
// NOTE: unconditional __syncthreads() — every thread of the block must call
// this, outside of divergent control flow (applies to all functions below).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  *V = 0.0;
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
// Same contraction as ContractTransposeX3dFlattened but accumulates into *V
// instead of overwriting it (no *V = 0.0).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  __syncthreads();
  data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
    for (CeedInt i = 0; i < Q_1D; i++) {
      *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];  // Contract x direction
    }
  }
}

//------------------------------------------------------------------------------
// 3D pack/unpack quadrature values
//------------------------------------------------------------------------------
// Repack the NUM_COMP per-thread register values from the 3D thread layout
// (t_id_x, t_id_y, t_id_z) over T_1D^3 into the contiguous layout where the
// first Q_1D^3 threads (indexed by data.t_id_x) each hold one value.
// Threads past Q_1D^3 receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Guard all three indices (t_id_z included, mirroring the x/y checks) so
    // out-of-range threads never stage stale values in the slice
    if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
  }
}

// Inverse of QPack3d: scatter the contiguous Q_1D^3 values (one per thread
// index data.t_id_x) back to the 3D (t_id_x, t_id_y, t_id_z) thread layout.
// Threads outside the Q_1D^3 range receive 0.0.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    // t_id_z check added for consistency with QPack3d: out-of-range threads get
    // 0.0 rather than whatever the slice last held at their location
    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
// Interpolate NUM_COMP components from P_1D^3 nodes (r_U) to Q_1D^3
// quadrature points (r_V) by contracting x, y, then z with c_B.
// r_U/r_V are unpacked/packed via QUnpack3d/QPack3d when the respective
// extent differs from the T_1D thread-layout extent.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];  // Single-value temporaries between contractions

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);  // Restore input packing
  if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpTensor3dFlattened: map NUM_COMP components from Q_1D^3
// quadrature points (r_U) back to P_1D^3 nodes (r_V), contracting z, y, then
// x with c_B transposed.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// Evaluate the gradient at quadrature points: for each component, contract
// with c_G in one direction and c_B in the other two.
// Output layout: r_V[comp + d * NUM_COMP] holds the d-th derivative (d = 0, 1, 2
// for x, y, z).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);  // Three derivative fields per component
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradTensor3dFlattened: accumulate the x, y, and z derivative
// contributions of r_U (layout as produced by GradTensor3dFlattened) into the
// nodal values r_V, using the add variants for the second and third
// contributions.
// BUG FIX: the runtime indices t_id_x/t_id_y/t_id_z were previously passed as
// template arguments (non-constant, wrong arity — did not compile); they are
// now passed as function arguments, matching every other caller in this file.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points (collocated gradient)
//------------------------------------------------------------------------------
// Gradient with a collocated derivative operator: first interpolate to the
// Q_1D^3 quadrature points with c_B, then apply c_G (a Q_1D x Q_1D operator)
// along each direction at the quadrature points.
// Output layout matches GradTensor3dFlattened: r_V[comp + d * NUM_COMP].
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Interpolate nodes -> quadrature points (result in r_t1)
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    // Apply the collocated derivative in each direction (note Q_1D for both
    // input and output extents)
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D derivatives transpose (collocated gradient)
//------------------------------------------------------------------------------
// Transpose of GradTensorCollocated3dFlattened: accumulate the transposed
// collocated derivatives of all three directions at the quadrature points
// (into r_t2), then apply the transposed interpolation back to the P_1D^3
// nodes (into r_V).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar r_t1[1], r_t2[1];

  if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // r_t2 = G^T_z u_z + G^T_y u_y + G^T_x u_x at the quadrature points
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
    // Transposed interpolation quadrature points -> nodes
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  __syncthreads();
  if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
// Each of the first Q_1D^3 threads receives the tensor-product weight
// q_weight_1d[x] * q_weight_1d[y] * q_weight_1d[z] for its quadrature point;
// threads past that range receive 0.0. P_1D is unused here (kept for a
// signature parallel to the 2D version).
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Decompose the flattened thread index into quadrature point coordinates
  const CeedInt q_x = data.t_id_x % Q_1D;
  const CeedInt q_y = (data.t_id_x / Q_1D) % Q_1D;
  const CeedInt q_z = data.t_id_x / (Q_1D * Q_1D);

  if (q_x < Q_1D && q_y < Q_1D && q_z < Q_1D) {
    *w = q_weight_1d[q_x] * q_weight_1d[q_y] * q_weight_1d[q_z];
  } else {
    *w = 0.0;
  }
}