1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
29b91271bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
39b91271bSJeremy L Thompson //
49b91271bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
59b91271bSJeremy L Thompson //
69b91271bSJeremy L Thompson // This file is part of CEED: http://github.com/ceed
79b91271bSJeremy L Thompson
89b91271bSJeremy L Thompson /// @file
99b91271bSJeremy L Thompson /// Internal header for HIP shared memory tensor product basis templates
109b91271bSJeremy L Thompson #include <ceed/types.h>
119b91271bSJeremy L Thompson
129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
139b91271bSJeremy L Thompson // 2D
149b91271bSJeremy L Thompson //------------------------------------------------------------------------------
159b91271bSJeremy L Thompson
169b91271bSJeremy L Thompson //------------------------------------------------------------------------------
179b91271bSJeremy L Thompson // 2D tensor contraction x
189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
199b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX2dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)209b91271bSJeremy L Thompson inline __device__ void ContractX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
219b91271bSJeremy L Thompson CeedScalar *V) {
22d6c19ee8SJeremy L Thompson __syncthreads();
239b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U;
249b91271bSJeremy L Thompson __syncthreads();
259b91271bSJeremy L Thompson *V = 0.0;
269b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) {
279b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) {
289b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
299b91271bSJeremy L Thompson }
309b91271bSJeremy L Thompson }
319b91271bSJeremy L Thompson }
329b91271bSJeremy L Thompson
339b91271bSJeremy L Thompson //------------------------------------------------------------------------------
349b91271bSJeremy L Thompson // 2D tensor contract y
359b91271bSJeremy L Thompson //------------------------------------------------------------------------------
369b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY2dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)379b91271bSJeremy L Thompson inline __device__ void ContractY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
389b91271bSJeremy L Thompson CeedScalar *V) {
39d6c19ee8SJeremy L Thompson __syncthreads();
409b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U;
419b91271bSJeremy L Thompson __syncthreads();
429b91271bSJeremy L Thompson *V = 0.0;
439b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) {
449b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) {
459b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction
469b91271bSJeremy L Thompson }
479b91271bSJeremy L Thompson }
489b91271bSJeremy L Thompson }
499b91271bSJeremy L Thompson
509b91271bSJeremy L Thompson //------------------------------------------------------------------------------
519b91271bSJeremy L Thompson // 2D transpose tensor contract y
529b91271bSJeremy L Thompson //------------------------------------------------------------------------------
539b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY2dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)549b91271bSJeremy L Thompson inline __device__ void ContractTransposeY2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
559b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
56d6c19ee8SJeremy L Thompson __syncthreads();
579b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U;
589b91271bSJeremy L Thompson __syncthreads();
599b91271bSJeremy L Thompson *V = 0.0;
609b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D) {
619b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
629b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction
639b91271bSJeremy L Thompson }
649b91271bSJeremy L Thompson }
659b91271bSJeremy L Thompson }
669b91271bSJeremy L Thompson
679b91271bSJeremy L Thompson //------------------------------------------------------------------------------
689b91271bSJeremy L Thompson // 2D transpose tensor contract x
699b91271bSJeremy L Thompson //------------------------------------------------------------------------------
709b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX2dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)719b91271bSJeremy L Thompson inline __device__ void ContractTransposeX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
729b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
73d6c19ee8SJeremy L Thompson __syncthreads();
749b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U;
759b91271bSJeremy L Thompson __syncthreads();
769b91271bSJeremy L Thompson *V = 0.0;
779b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) {
789b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
799b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
809b91271bSJeremy L Thompson }
819b91271bSJeremy L Thompson }
829b91271bSJeremy L Thompson }
839b91271bSJeremy L Thompson
849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
859b91271bSJeremy L Thompson // 2D transpose tensor contract and add x
869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX2dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)889b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
899b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
90d6c19ee8SJeremy L Thompson __syncthreads();
919b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D] = *U;
929b91271bSJeremy L Thompson __syncthreads();
939b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D) {
949b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
959b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
969b91271bSJeremy L Thompson }
979b91271bSJeremy L Thompson }
989b91271bSJeremy L Thompson }
999b91271bSJeremy L Thompson
1009b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1019b91271bSJeremy L Thompson // 2D pack/unpack quadrature values
1029b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1039b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QPack2d(SharedData_Hip & data,const int t_id_x,const int t_id_y,CeedScalar * U)1049b91271bSJeremy L Thompson inline __device__ void QPack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1059b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
1069b91271bSJeremy L Thompson
1079b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108d6c19ee8SJeremy L Thompson __syncthreads();
1099b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
1109b91271bSJeremy L Thompson __syncthreads();
1119b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
1129b91271bSJeremy L Thompson }
1139b91271bSJeremy L Thompson }
1149b91271bSJeremy L Thompson
1159b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QUnpack2d(SharedData_Hip & data,const int t_id_x,const int t_id_y,CeedScalar * U)1169b91271bSJeremy L Thompson inline __device__ void QUnpack2d(SharedData_Hip &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
1179b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
1189b91271bSJeremy L Thompson
1199b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120d6c19ee8SJeremy L Thompson __syncthreads();
1219b91271bSJeremy L Thompson if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
1229b91271bSJeremy L Thompson __syncthreads();
1239b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
1249b91271bSJeremy L Thompson }
1259b91271bSJeremy L Thompson }
1269b91271bSJeremy L Thompson
1279b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1289b91271bSJeremy L Thompson // 2D interpolate to quadrature points
1299b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1309b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1319b91271bSJeremy L Thompson inline __device__ void InterpTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1329b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
1339b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1349b91271bSJeremy L Thompson CeedScalar r_t[1];
1359b91271bSJeremy L Thompson
136ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1379b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1389b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1399b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1409b91271bSJeremy L Thompson }
1413e2e790dSJeremy L Thompson __syncthreads();
142ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
143ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1449b91271bSJeremy L Thompson }
1459b91271bSJeremy L Thompson
1469b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1479b91271bSJeremy L Thompson // 2D interpolate transpose
1489b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1499b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1509b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1519b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
1529b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1539b91271bSJeremy L Thompson CeedScalar r_t[1];
1549b91271bSJeremy L Thompson
155ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1569b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1579b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
1589b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
1599b91271bSJeremy L Thompson }
1603e2e790dSJeremy L Thompson __syncthreads();
161ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1629b91271bSJeremy L Thompson }
1639b91271bSJeremy L Thompson
1649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
1650ccda8ebSJeremy L Thompson // 2D interpolate to quadrature points, nodes and quadrature points collocated
1660ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1670ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1680ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1690ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
1700ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1710ccda8ebSJeremy L Thompson
1720ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1730ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1740ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
1750ccda8ebSJeremy L Thompson }
1760ccda8ebSJeremy L Thompson __syncthreads();
1770ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1780ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1790ccda8ebSJeremy L Thompson }
1800ccda8ebSJeremy L Thompson
1810ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1820ccda8ebSJeremy L Thompson // 2D interpolate transpose, nodes and quadrature points collocated
1830ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1840ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)1850ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
1860ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
1870ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
1880ccda8ebSJeremy L Thompson
1890ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
1900ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
1910ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
1920ccda8ebSJeremy L Thompson }
1930ccda8ebSJeremy L Thompson __syncthreads();
1940ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
1950ccda8ebSJeremy L Thompson }
1960ccda8ebSJeremy L Thompson
1970ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
1989b91271bSJeremy L Thompson // 2D derivatives at quadrature points
1999b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2009b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2019b91271bSJeremy L Thompson inline __device__ void GradTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
2029b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
2039b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2049b91271bSJeremy L Thompson CeedScalar r_t[1];
2059b91271bSJeremy L Thompson
206ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2079b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2089b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, r_t);
2099b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
2109b91271bSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_B, r_t);
2119b91271bSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
2129b91271bSJeremy L Thompson }
2133e2e790dSJeremy L Thompson __syncthreads();
214ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
215ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2169b91271bSJeremy L Thompson }
2179b91271bSJeremy L Thompson
2189b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2199b91271bSJeremy L Thompson // 2D derivatives transpose
2209b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2219b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2229b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2239b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2249b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2259b91271bSJeremy L Thompson CeedScalar r_t[1];
2269b91271bSJeremy L Thompson
227ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2289b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2299b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
2309b91271bSJeremy L Thompson ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_G, &r_V[comp]);
2319b91271bSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
2329b91271bSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, r_t, c_B, &r_V[comp]);
2339b91271bSJeremy L Thompson }
2343e2e790dSJeremy L Thompson __syncthreads();
235ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2369b91271bSJeremy L Thompson }
2379b91271bSJeremy L Thompson
2389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2390ccda8ebSJeremy L Thompson // 2D derivatives at quadrature points, nodes and quadrature points collocated
2400ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2410ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2420ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2430ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2440ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2450ccda8ebSJeremy L Thompson
2460ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2470ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2480ccda8ebSJeremy L Thompson ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
2490ccda8ebSJeremy L Thompson ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
2500ccda8ebSJeremy L Thompson }
2510ccda8ebSJeremy L Thompson __syncthreads();
2520ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2530ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2540ccda8ebSJeremy L Thompson }
2550ccda8ebSJeremy L Thompson
2560ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2570ccda8ebSJeremy L Thompson // 2D derivatives transpose, nodes and quadrature points collocated
2580ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2590ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)2600ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes2dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
2610ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
2620ccda8ebSJeremy L Thompson const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D;
2630ccda8ebSJeremy L Thompson
2640ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, t_id_x, t_id_y, r_U);
2650ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
2660ccda8ebSJeremy L Thompson ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
2670ccda8ebSJeremy L Thompson ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
2680ccda8ebSJeremy L Thompson }
2690ccda8ebSJeremy L Thompson __syncthreads();
2700ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack2d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, r_V);
2710ccda8ebSJeremy L Thompson }
2720ccda8ebSJeremy L Thompson
2730ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
2749b91271bSJeremy L Thompson // 2D quadrature weights
2759b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2769b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor2dFlattened(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)2779b91271bSJeremy L Thompson inline __device__ void WeightTensor2dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
2789b91271bSJeremy L Thompson const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D;
2799b91271bSJeremy L Thompson
2809b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0;
2819b91271bSJeremy L Thompson }
2829b91271bSJeremy L Thompson
2839b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2849b91271bSJeremy L Thompson // 3D
2859b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2869b91271bSJeremy L Thompson
2879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2889b91271bSJeremy L Thompson // 3D tensor contract x
2899b91271bSJeremy L Thompson //------------------------------------------------------------------------------
2909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)2919b91271bSJeremy L Thompson inline __device__ void ContractX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
2929b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
293d6c19ee8SJeremy L Thompson __syncthreads();
2949b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
2959b91271bSJeremy L Thompson __syncthreads();
2969b91271bSJeremy L Thompson *V = 0.0;
2979b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
2989b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) {
2999b91271bSJeremy L Thompson *V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
3009b91271bSJeremy L Thompson }
3019b91271bSJeremy L Thompson }
3029b91271bSJeremy L Thompson }
3039b91271bSJeremy L Thompson
3049b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3059b91271bSJeremy L Thompson // 3D tensor contract y
3069b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3089b91271bSJeremy L Thompson inline __device__ void ContractY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3099b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
310d6c19ee8SJeremy L Thompson __syncthreads();
3119b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3129b91271bSJeremy L Thompson __syncthreads();
3139b91271bSJeremy L Thompson *V = 0.0;
3149b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3159b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) {
3169b91271bSJeremy L Thompson *V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
3179b91271bSJeremy L Thompson }
3189b91271bSJeremy L Thompson }
3199b91271bSJeremy L Thompson }
3209b91271bSJeremy L Thompson
3219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3229b91271bSJeremy L Thompson // 3D tensor contract z
3239b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractZ3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3259b91271bSJeremy L Thompson inline __device__ void ContractZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3269b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
327d6c19ee8SJeremy L Thompson __syncthreads();
3289b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3299b91271bSJeremy L Thompson __syncthreads();
3309b91271bSJeremy L Thompson *V = 0.0;
3319b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) {
3329b91271bSJeremy L Thompson for (CeedInt i = 0; i < P_1D; i++) {
3339b91271bSJeremy L Thompson *V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
3349b91271bSJeremy L Thompson }
3359b91271bSJeremy L Thompson }
3369b91271bSJeremy L Thompson }
3379b91271bSJeremy L Thompson
3389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3399b91271bSJeremy L Thompson // 3D tensor contract z
3409b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3419b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeZ3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3429b91271bSJeremy L Thompson inline __device__ void ContractTransposeZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3439b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
344d6c19ee8SJeremy L Thompson __syncthreads();
3459b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3469b91271bSJeremy L Thompson __syncthreads();
3479b91271bSJeremy L Thompson *V = 0.0;
3489b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3499b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3509b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
3519b91271bSJeremy L Thompson }
3529b91271bSJeremy L Thompson }
3539b91271bSJeremy L Thompson }
3549b91271bSJeremy L Thompson
3559b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3569b91271bSJeremy L Thompson // 3D transpose tensor contract z
3579b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3589b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddZ3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3599b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3609b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
361d6c19ee8SJeremy L Thompson __syncthreads();
3629b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3639b91271bSJeremy L Thompson __syncthreads();
3649b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
3659b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3669b91271bSJeremy L Thompson *V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
3679b91271bSJeremy L Thompson }
3689b91271bSJeremy L Thompson }
3699b91271bSJeremy L Thompson }
3709b91271bSJeremy L Thompson
3719b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3729b91271bSJeremy L Thompson // 3D transpose tensor contract y
3739b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3749b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3759b91271bSJeremy L Thompson inline __device__ void ContractTransposeY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
3769b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
377d6c19ee8SJeremy L Thompson __syncthreads();
3789b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3799b91271bSJeremy L Thompson __syncthreads();
3809b91271bSJeremy L Thompson *V = 0.0;
3819b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3829b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3839b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
3849b91271bSJeremy L Thompson }
3859b91271bSJeremy L Thompson }
3869b91271bSJeremy L Thompson }
3879b91271bSJeremy L Thompson
3889b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3899b91271bSJeremy L Thompson // 3D transpose tensor contract y
3909b91271bSJeremy L Thompson //------------------------------------------------------------------------------
3919b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddY3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)3929b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
3939b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
394d6c19ee8SJeremy L Thompson __syncthreads();
3959b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
3969b91271bSJeremy L Thompson __syncthreads();
3979b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
3989b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
3999b91271bSJeremy L Thompson *V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
4009b91271bSJeremy L Thompson }
4019b91271bSJeremy L Thompson }
4029b91271bSJeremy L Thompson }
4039b91271bSJeremy L Thompson
4049b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4059b91271bSJeremy L Thompson // 3D transpose tensor contract x
4069b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4079b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)4089b91271bSJeremy L Thompson inline __device__ void ContractTransposeX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
4099b91271bSJeremy L Thompson const CeedScalar *B, CeedScalar *V) {
410d6c19ee8SJeremy L Thompson __syncthreads();
4119b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
4129b91271bSJeremy L Thompson __syncthreads();
4139b91271bSJeremy L Thompson *V = 0.0;
4149b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
4159b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
4169b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
4179b91271bSJeremy L Thompson }
4189b91271bSJeremy L Thompson }
4199b91271bSJeremy L Thompson }
4209b91271bSJeremy L Thompson
4219b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4229b91271bSJeremy L Thompson // 3D transpose tensor contract add x
4239b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4249b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX3dFlattened(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)4259b91271bSJeremy L Thompson inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z,
4269b91271bSJeremy L Thompson const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
427d6c19ee8SJeremy L Thompson __syncthreads();
4289b91271bSJeremy L Thompson data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
4299b91271bSJeremy L Thompson __syncthreads();
4309b91271bSJeremy L Thompson if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
4319b91271bSJeremy L Thompson for (CeedInt i = 0; i < Q_1D; i++) {
4329b91271bSJeremy L Thompson *V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
4339b91271bSJeremy L Thompson }
4349b91271bSJeremy L Thompson }
4359b91271bSJeremy L Thompson }
4369b91271bSJeremy L Thompson
4379b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4389b91271bSJeremy L Thompson // 3D pack/unpack quadrature values
4399b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4409b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QPack3d(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,CeedScalar * U)4419b91271bSJeremy L Thompson inline __device__ void QPack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
4429b91271bSJeremy L Thompson const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
4439b91271bSJeremy L Thompson
4449b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
445d6c19ee8SJeremy L Thompson __syncthreads();
4469b91271bSJeremy L Thompson if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
4479b91271bSJeremy L Thompson __syncthreads();
4489b91271bSJeremy L Thompson U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
4499b91271bSJeremy L Thompson }
4509b91271bSJeremy L Thompson }
4519b91271bSJeremy L Thompson
4529b91271bSJeremy L Thompson template <int NUM_COMP, int Q_1D, int T_1D>
QUnpack3d(SharedData_Hip & data,const int t_id_x,const int t_id_y,const int t_id_z,CeedScalar * U)4539b91271bSJeremy L Thompson inline __device__ void QUnpack3d(SharedData_Hip &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
4549b91271bSJeremy L Thompson const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
4559b91271bSJeremy L Thompson
4569b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
457d6c19ee8SJeremy L Thompson __syncthreads();
4589b91271bSJeremy L Thompson if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
4599b91271bSJeremy L Thompson __syncthreads();
4609b91271bSJeremy L Thompson U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
4619b91271bSJeremy L Thompson }
4629b91271bSJeremy L Thompson }
4639b91271bSJeremy L Thompson
4649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4659b91271bSJeremy L Thompson // 3D interpolate to quadrature points
4669b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4679b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)4689b91271bSJeremy L Thompson inline __device__ void InterpTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4699b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
4709b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4719b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
4729b91271bSJeremy L Thompson
473ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4749b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4759b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4769b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4779b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4789b91271bSJeremy L Thompson }
4793e2e790dSJeremy L Thompson __syncthreads();
480ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
481ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
4829b91271bSJeremy L Thompson }
4839b91271bSJeremy L Thompson
4849b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4859b91271bSJeremy L Thompson // 3D interpolate transpose
4869b91271bSJeremy L Thompson //------------------------------------------------------------------------------
4879b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)4889b91271bSJeremy L Thompson inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
4899b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
4909b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
4919b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
4929b91271bSJeremy L Thompson
493ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
4949b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
4959b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
4969b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
4979b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
4989b91271bSJeremy L Thompson }
4993e2e790dSJeremy L Thompson __syncthreads();
500ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5019b91271bSJeremy L Thompson }
5029b91271bSJeremy L Thompson
5039b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5040ccda8ebSJeremy L Thompson // 3D interpolate to quadrature points, nodes and quadrature points collocated
5050ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5060ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5070ccda8ebSJeremy L Thompson inline __device__ void InterpTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5080ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5090ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5100ccda8ebSJeremy L Thompson
5110ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5120ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5130ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
5140ccda8ebSJeremy L Thompson }
5150ccda8ebSJeremy L Thompson __syncthreads();
5160ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5170ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5180ccda8ebSJeremy L Thompson }
5190ccda8ebSJeremy L Thompson
5200ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5210ccda8ebSJeremy L Thompson // 3D interpolate transpose, nodes and quadrature points collocated
5220ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5230ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)5240ccda8ebSJeremy L Thompson inline __device__ void InterpTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5250ccda8ebSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5260ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5270ccda8ebSJeremy L Thompson
5280ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5290ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5300ccda8ebSJeremy L Thompson r_V[comp] = r_U[comp];
5310ccda8ebSJeremy L Thompson }
5320ccda8ebSJeremy L Thompson __syncthreads();
5330ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5340ccda8ebSJeremy L Thompson }
5350ccda8ebSJeremy L Thompson
5360ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
5379b91271bSJeremy L Thompson // 3D derivatives at quadrature points
5389b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5399b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5409b91271bSJeremy L Thompson inline __device__ void GradTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
5419b91271bSJeremy L Thompson CeedScalar *__restrict__ r_V) {
5429b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5439b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
5449b91271bSJeremy L Thompson
545ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5469b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5479b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_G, r_t1);
5489b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5499b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 0 * NUM_COMP]);
5509b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5519b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
5529b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp + 1 * NUM_COMP]);
5539b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5549b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
5559b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp + 2 * NUM_COMP]);
5569b91271bSJeremy L Thompson }
5573e2e790dSJeremy L Thompson __syncthreads();
558ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
559ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5609b91271bSJeremy L Thompson }
5619b91271bSJeremy L Thompson
5629b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5639b91271bSJeremy L Thompson // 3D derivatives transpose
5649b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5659b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5669b91271bSJeremy L Thompson inline __device__ void GradTransposeTensor3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5679b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5689b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5699b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
5709b91271bSJeremy L Thompson
571ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5729b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5739b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
5749b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5759b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp]);
5769b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
5779b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
5789b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
5799b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
5809b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
5819b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, t_id_x, t_id_y, t_id_z, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp]);
5829b91271bSJeremy L Thompson }
5833e2e790dSJeremy L Thompson __syncthreads();
584ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
5859b91271bSJeremy L Thompson }
5869b91271bSJeremy L Thompson
5879b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5889b91271bSJeremy L Thompson // 3D derivatives at quadrature points
5899b91271bSJeremy L Thompson //------------------------------------------------------------------------------
5909b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocated3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)5919b91271bSJeremy L Thompson inline __device__ void GradTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
5929b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
5939b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
5949b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
5959b91271bSJeremy L Thompson
596ce44184cSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
5979b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
5989b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp], c_B, r_t1);
5999b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
6009b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
6019b91271bSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 0 * NUM_COMP]);
6029b91271bSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 1 * NUM_COMP]);
6039b91271bSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, &r_V[comp + 2 * NUM_COMP]);
6049b91271bSJeremy L Thompson }
6053e2e790dSJeremy L Thompson __syncthreads();
606ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
607ce44184cSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6089b91271bSJeremy L Thompson }
6099b91271bSJeremy L Thompson
6109b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6119b91271bSJeremy L Thompson // 3D derivatives transpose
6129b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6139b91271bSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocated3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6149b91271bSJeremy L Thompson inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6159b91271bSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6169b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6179b91271bSJeremy L Thompson CeedScalar r_t1[1], r_t2[1];
6189b91271bSJeremy L Thompson
619ce44184cSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6209b91271bSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6219b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t2);
6229b91271bSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, r_t2);
6239b91271bSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, r_t2);
6249b91271bSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, r_t1);
6259b91271bSJeremy L Thompson ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
6269b91271bSJeremy L Thompson ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
6279b91271bSJeremy L Thompson }
6283e2e790dSJeremy L Thompson __syncthreads();
629ce44184cSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6309b91271bSJeremy L Thompson }
6319b91271bSJeremy L Thompson
6329b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6330ccda8ebSJeremy L Thompson // 3D derivatives at quadrature points, nodes and quadrature points collocated
6340ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6350ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6360ccda8ebSJeremy L Thompson inline __device__ void GradTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6370ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6380ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6390ccda8ebSJeremy L Thompson
6400ccda8ebSJeremy L Thompson if (P_1D != T_1D) QUnpack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6410ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6420ccda8ebSJeremy L Thompson ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
6430ccda8ebSJeremy L Thompson ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
6440ccda8ebSJeremy L Thompson ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U[comp], c_G, &r_V[comp + 2 * NUM_COMP]);
6450ccda8ebSJeremy L Thompson }
6460ccda8ebSJeremy L Thompson __syncthreads();
6470ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6480ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6490ccda8ebSJeremy L Thompson }
6500ccda8ebSJeremy L Thompson
6510ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6520ccda8ebSJeremy L Thompson // 3D derivatives transpose, nodes and quadrature points collocated
6530ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6540ccda8ebSJeremy L Thompson template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip & data,CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)6550ccda8ebSJeremy L Thompson inline __device__ void GradTransposeTensorCollocatedNodes3dFlattened(SharedData_Hip &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
6560ccda8ebSJeremy L Thompson const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
6570ccda8ebSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
6580ccda8ebSJeremy L Thompson
6590ccda8ebSJeremy L Thompson if (Q_1D != T_1D) QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
6600ccda8ebSJeremy L Thompson for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
6610ccda8ebSJeremy L Thompson ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, &r_V[comp]);
6620ccda8ebSJeremy L Thompson ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
6630ccda8ebSJeremy L Thompson ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
6640ccda8ebSJeremy L Thompson }
6650ccda8ebSJeremy L Thompson __syncthreads();
6660ccda8ebSJeremy L Thompson if (P_1D != T_1D) QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
6670ccda8ebSJeremy L Thompson }
6680ccda8ebSJeremy L Thompson
6690ccda8ebSJeremy L Thompson //------------------------------------------------------------------------------
6709b91271bSJeremy L Thompson // 3D quadrature weights
6719b91271bSJeremy L Thompson //------------------------------------------------------------------------------
6729b91271bSJeremy L Thompson template <int P_1D, int Q_1D>
WeightTensor3dFlattened(SharedData_Hip & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)6739b91271bSJeremy L Thompson inline __device__ void WeightTensor3dFlattened(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
6749b91271bSJeremy L Thompson const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D);
6759b91271bSJeremy L Thompson
6769b91271bSJeremy L Thompson *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0;
6779b91271bSJeremy L Thompson }
678