jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h

*c8e372f0SJeremy L Thompson// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
*c8e372f0SJeremy L Thompson// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
*c8e372f0SJeremy L Thompson//
*c8e372f0SJeremy L Thompson// SPDX-License-Identifier: BSD-2-Clause
*c8e372f0SJeremy L Thompson//
*c8e372f0SJeremy L Thompson// This file is part of CEED:  http://github.com/ceed
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson/// @file
*c8e372f0SJeremy L Thompson/// Internal header for CUDA shared memory tensor product basis templates
*c8e372f0SJeremy L Thompson#include <ceed/types.h>
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D tensor contraction x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply B along x: for t_id_x < Q_1D and t_id_y < P_1D,
//   V = sum_{i < P_1D} B[i + t_id_x * P_1D] * slice[i + t_id_y * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  // Start of this thread's row in the scratch buffer (row stride is T_1D)
  const int  row_start = t_id_y * T_1D;
  const bool is_active = (t_id_x < Q_1D) && (t_id_y < P_1D);
  CeedScalar sum       = 0.0;

  // Every thread publishes its input before any thread reads the row
  data.slice[t_id_x + row_start] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt node = 0; node < P_1D; node++) {
      sum += B[node + t_id_x * P_1D] * data.slice[node + row_start];  // Contract x direction
    }
  }
  // Inactive threads report zero
  *V = sum;
  // Keep the scratch values intact until every thread has finished reading them
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D tensor contract y
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply B along y: for t_id_x < Q_1D and t_id_y < Q_1D,
//   V = sum_{i < P_1D} B[i + t_id_y * P_1D] * slice[t_id_x + i * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
                                            CeedScalar *V) {
  CeedScalar column_sum = 0.0;

  // Stage the input value, then wait until the whole column is visible
  data.slice[t_id_y * T_1D + t_id_x] = *U;
  __syncthreads();
  if (t_id_y < Q_1D && t_id_x < Q_1D) {
    // Row of B selected by this thread's y index
    const CeedScalar *b_row = B + t_id_y * P_1D;

    for (CeedInt row = 0; row < P_1D; row++) {
      column_sum += b_row[row] * data.slice[t_id_x + row * T_1D];  // Contract y direction
    }
  }
  *V = column_sum;
  // Do not let later stages overwrite the scratch buffer while it is still being read
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D transpose tensor contract y
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along y: for t_id_x < Q_1D and t_id_y < P_1D,
//   V = sum_{i < Q_1D} B[t_id_y + i * P_1D] * slice[t_id_x + i * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  const bool is_active = (t_id_x < Q_1D) && (t_id_y < P_1D);
  CeedScalar sum       = 0.0;

  // Publish the input, then wait for the rest of the block
  data.slice[t_id_y * T_1D + t_id_x] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      const CeedScalar weight = B[t_id_y + q * P_1D];

      sum += weight * data.slice[t_id_x + q * T_1D];  // Contract y direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D transpose tensor contract x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along x: for t_id_x < P_1D and t_id_y < P_1D,
//   V = sum_{i < Q_1D} B[t_id_x + i * P_1D] * slice[i + t_id_y * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Start of this thread's row in the scratch buffer (row stride is T_1D)
  const int  row_start = t_id_y * T_1D;
  CeedScalar sum       = 0.0;

  data.slice[row_start + t_id_x] = *U;
  __syncthreads();
  if ((t_id_x < P_1D) && (t_id_y < P_1D)) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_x + q * P_1D] * data.slice[q + row_start];  // Contract x direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D transpose tensor contract and add x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along x and accumulate into V: for t_id_x < P_1D and t_id_y < P_1D,
//   V += sum_{i < Q_1D} B[t_id_x + i * P_1D] * slice[i + t_id_y * T_1D];
// every other thread leaves V untouched. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
                                                        const CeedScalar *B, CeedScalar *V) {
  const int row_start = t_id_y * T_1D;

  data.slice[row_start + t_id_x] = *U;
  __syncthreads();
  if ((t_id_x < P_1D) && (t_id_y < P_1D)) {
    // Start from the existing value so terms are added in the same order as a direct += on *V
    CeedScalar running = *V;

    for (CeedInt q = 0; q < Q_1D; q++) {
      running += B[t_id_x + q * P_1D] * data.slice[q + row_start];  // Contract x direction
    }
    *V = running;
  }
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D pack/unpack quadrature values
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Re-index each of the NUM_COMP values in U from the (t_id_x, t_id_y) layout with row stride T_1D
// to a layout where the flat index data.t_id_x enumerates a Q_1D x Q_1D grid row by row.
// Threads whose flat index is >= Q_1D * Q_1D end up with 0.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Grid coordinates addressed by this thread's flat index
  const CeedInt packed_x = data.t_id_x % Q_1D;
  const CeedInt packed_y = data.t_id_x / Q_1D;
  const bool    is_source = (t_id_x < Q_1D) && (t_id_y < Q_1D);
  const bool    is_target = data.t_id_x < (Q_1D * Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    if (is_source) {
      data.slice[t_id_x + t_id_y * T_1D] = U[c];
    }
    __syncthreads();
    if (is_target) {
      U[c] = data.slice[packed_x + packed_y * T_1D];
    } else {
      U[c] = 0.0;
    }
    // The next component reuses the same scratch entries
    __syncthreads();
  }
}
*c8e372f0SJeremy L Thompson
// Inverse re-indexing of QPack2d: each of the NUM_COMP values in U moves from the layout where the flat
// index data.t_id_x enumerates a Q_1D x Q_1D grid row by row to the (t_id_x, t_id_y) layout with row stride T_1D.
// Threads with t_id_x >= Q_1D or t_id_y >= Q_1D end up with 0.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  // Grid coordinates addressed by this thread's flat index
  const CeedInt packed_x = data.t_id_x % Q_1D;
  const CeedInt packed_y = data.t_id_x / Q_1D;
  const bool    is_source = data.t_id_x < (Q_1D * Q_1D);
  const bool    is_target = (t_id_x < Q_1D) && (t_id_y < Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    if (is_source) {
      data.slice[packed_x + packed_y * T_1D] = U[c];
    }
    __syncthreads();
    if (is_target) {
      U[c] = data.slice[t_id_x + t_id_y * T_1D];
    } else {
      U[c] = 0.0;
    }
    // The next component reuses the same scratch entries
    __syncthreads();
  }
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D interpolate to quadrature points
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 2D interpolation driver. Steps, in order:
//   1. QUnpack2d with extent P_1D rearranges r_U (modified in place);
//   2. per component, c_B is applied along x and then along y, writing r_V[comp];
//   3. QPack2d with extent Q_1D rearranges r_V.
// r_U and r_V each hold NUM_COMP values for the calling thread.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with row length T_1D
  const int  thread_x = data.t_id_x % T_1D;
  const int  thread_y = data.t_id_x / T_1D;
  CeedScalar r_x;  // result of the x contraction, input of the y contraction

  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, r_U + c, c_B, &r_x);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_x, c_B, r_V + c);
  }
  QPack2d<NUM_COMP, Q_1D, T_1D>(data, thread_x, thread_y, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D interpolate transpose
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 2D transpose-interpolation driver. Steps, in order:
//   1. QUnpack2d with extent Q_1D rearranges r_U (modified in place);
//   2. per component, the transpose of c_B is applied along y and then along x, writing r_V[comp];
//   3. QPack2d with extent P_1D rearranges r_V.
// r_U and r_V each hold NUM_COMP values for the calling thread.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with row length T_1D
  const int  thread_x = data.t_id_x % T_1D;
  const int  thread_y = data.t_id_x / T_1D;
  CeedScalar r_y;  // result of the y contraction, input of the x contraction

  QUnpack2d<NUM_COMP, Q_1D, T_1D>(data, thread_x, thread_y, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, r_U + c, c_B, &r_y);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_y, c_B, r_V + c);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D derivatives at quadrature points
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 2D gradient driver. r_U holds NUM_COMP values; r_V receives 2 * NUM_COMP values:
//   r_V[comp]            <- c_G along x, then c_B along y
//   r_V[comp + NUM_COMP] <- c_B along x, then c_G along y
// r_U is first rearranged by QUnpack2d (extent P_1D) and r_V is finally rearranged by QPack2d (extent Q_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with row length T_1D
  const int  thread_x = data.t_id_x % T_1D;
  const int  thread_y = data.t_id_x / T_1D;
  CeedScalar r_x;  // result of the x contraction, input of the y contraction

  QUnpack2d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    CeedScalar *dx_out = r_V + c;
    CeedScalar *dy_out = r_V + c + NUM_COMP;

    // First output block: c_G in x, c_B in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, r_U + c, c_G, &r_x);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_x, c_B, dx_out);
    // Second output block: c_B in x, c_G in y
    ContractX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, r_U + c, c_B, &r_x);
    ContractY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_x, c_G, dy_out);
  }
  QPack2d<NUM_COMP * 2, Q_1D, T_1D>(data, thread_x, thread_y, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D derivatives transpose
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 2D transpose-gradient driver. r_U holds 2 * NUM_COMP values; r_V receives NUM_COMP values:
//   r_V[comp]  = (transpose c_B along y, then transpose c_G along x) of r_U[comp]
//   r_V[comp] += (transpose c_G along y, then transpose c_B along x) of r_U[comp + NUM_COMP]
// r_U is first rearranged by QUnpack2d (extent Q_1D) and r_V is finally rearranged by QPack2d (extent P_1D).
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with row length T_1D
  const int  thread_x = data.t_id_x % T_1D;
  const int  thread_y = data.t_id_x / T_1D;
  CeedScalar r_y;  // result of the y contraction, input of the x contraction

  QUnpack2d<NUM_COMP * 2, Q_1D, T_1D>(data, thread_x, thread_y, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    const CeedScalar *dx_in = r_U + c;
    const CeedScalar *dy_in = r_U + c + NUM_COMP;

    // First input block overwrites r_V[c]
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, dx_in, c_B, &r_y);
    ContractTransposeX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_y, c_G, r_V + c);
    // Second input block accumulates into r_V[c]
    ContractTransposeY2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, dy_in, c_G, &r_y);
    ContractTransposeAddX2dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, &r_y, c_B, r_V + c);
  }
  QPack2d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 2D quadrature weights
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Write to *w the product q_weight_1d[x] * q_weight_1d[y], where (x, y) are obtained by splitting the
// flat index data.t_id_x with row length Q_1D; threads whose y index is >= Q_1D get 0.
// P_1D is not referenced by the body.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const int point_x = data.t_id_x % Q_1D;
  const int point_y = data.t_id_x / Q_1D;

  if (point_x < Q_1D && point_y < Q_1D) {
    *w = q_weight_1d[point_x] * q_weight_1d[point_y];
  } else {
    *w = 0.0;
  }
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D tensor contract x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply B along x: for t_id_x < Q_1D, t_id_y < P_1D and t_id_z < P_1D,
//   V = sum_{i < P_1D} B[i + t_id_x * P_1D] * slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's x-line in the scratch buffer (strides T_1D and T_1D * T_1D)
  const int  line_start = t_id_y * T_1D + t_id_z * T_1D * T_1D;
  const bool is_active  = (t_id_x < Q_1D) && (t_id_y < P_1D) && (t_id_z < P_1D);
  CeedScalar sum        = 0.0;

  // Every thread publishes its input before any thread reads the line
  data.slice[t_id_x + line_start] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt node = 0; node < P_1D; node++) {
      sum += B[node + t_id_x * P_1D] * data.slice[node + line_start];  // Contract x direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D tensor contract y
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply B along y: for t_id_x < Q_1D, t_id_y < Q_1D and t_id_z < P_1D,
//   V = sum_{i < P_1D} B[i + t_id_y * P_1D] * slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's z-layer in the scratch buffer
  const int  layer_start = t_id_z * T_1D * T_1D;
  const bool is_active   = (t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < P_1D);
  CeedScalar sum         = 0.0;

  data.slice[t_id_x + t_id_y * T_1D + layer_start] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt node = 0; node < P_1D; node++) {
      sum += B[node + t_id_y * P_1D] * data.slice[t_id_x + node * T_1D + layer_start];  // Contract y direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D tensor contract z
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply B along z: for t_id_x < Q_1D, t_id_y < Q_1D and t_id_z < Q_1D,
//   V = sum_{i < P_1D} B[i + t_id_z * P_1D] * slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                            const CeedScalar *B, CeedScalar *V) {
  // Position of this thread inside a z-layer of the scratch buffer
  const int  in_layer  = t_id_x + t_id_y * T_1D;
  const bool is_active = (t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < Q_1D);
  CeedScalar sum       = 0.0;

  data.slice[in_layer + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt node = 0; node < P_1D; node++) {
      sum += B[node + t_id_z * P_1D] * data.slice[in_layer + node * T_1D * T_1D];  // Contract z direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D transpose tensor contract z
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along z: for t_id_x < Q_1D, t_id_y < Q_1D and t_id_z < P_1D,
//   V = sum_{i < Q_1D} B[t_id_z + i * P_1D] * slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Position of this thread inside a z-layer of the scratch buffer
  const int  in_layer  = t_id_x + t_id_y * T_1D;
  const bool is_active = (t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < P_1D);
  CeedScalar sum       = 0.0;

  data.slice[in_layer + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_z + q * P_1D] * data.slice[in_layer + q * T_1D * T_1D];  // Contract z direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D transpose tensor contract and add z
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along z and accumulate into V: for t_id_x < Q_1D, t_id_y < Q_1D and t_id_z < P_1D,
//   V += sum_{i < Q_1D} B[t_id_z + i * P_1D] * slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D];
// every other thread leaves V untouched. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Position of this thread inside a z-layer of the scratch buffer
  const int in_layer = t_id_x + t_id_y * T_1D;

  data.slice[in_layer + t_id_z * T_1D * T_1D] = *U;
  __syncthreads();
  if ((t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < P_1D)) {
    // Start from the existing value so terms are added in the same order as a direct += on *V
    CeedScalar running = *V;

    for (CeedInt q = 0; q < Q_1D; q++) {
      running += B[t_id_z + q * P_1D] * data.slice[in_layer + q * T_1D * T_1D];  // Contract z direction
    }
    *V = running;
  }
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D transpose tensor contract y
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along y: for t_id_x < Q_1D, t_id_y < P_1D and t_id_z < P_1D,
//   V = sum_{i < Q_1D} B[t_id_y + i * P_1D] * slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's z-layer in the scratch buffer
  const int  layer_start = t_id_z * T_1D * T_1D;
  const bool is_active   = (t_id_x < Q_1D) && (t_id_y < P_1D) && (t_id_z < P_1D);
  CeedScalar sum         = 0.0;

  data.slice[t_id_x + t_id_y * T_1D + layer_start] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_y + q * P_1D] * data.slice[t_id_x + q * T_1D + layer_start];  // Contract y direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D transpose tensor contract and add y
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along y and accumulate into V: for t_id_x < Q_1D, t_id_y < P_1D and t_id_z < P_1D,
//   V += sum_{i < Q_1D} B[t_id_y + i * P_1D] * slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D];
// every other thread leaves V untouched. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's z-layer in the scratch buffer
  const int layer_start = t_id_z * T_1D * T_1D;

  data.slice[t_id_x + t_id_y * T_1D + layer_start] = *U;
  __syncthreads();
  if ((t_id_x < Q_1D) && (t_id_y < P_1D) && (t_id_z < P_1D)) {
    // Start from the existing value so terms are added in the same order as a direct += on *V
    CeedScalar running = *V;

    for (CeedInt q = 0; q < Q_1D; q++) {
      running += B[t_id_y + q * P_1D] * data.slice[t_id_x + q * T_1D + layer_start];  // Contract y direction
    }
    *V = running;
  }
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D transpose tensor contract x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along x: for t_id_x < P_1D, t_id_y < P_1D and t_id_z < P_1D,
//   V = sum_{i < Q_1D} B[t_id_x + i * P_1D] * slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];
// every other thread receives V = 0. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
                                                     const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's x-line in the scratch buffer (strides T_1D and T_1D * T_1D)
  const int  line_start = t_id_y * T_1D + t_id_z * T_1D * T_1D;
  const bool is_active  = (t_id_x < P_1D) && (t_id_y < P_1D) && (t_id_z < P_1D);
  CeedScalar sum        = 0.0;

  data.slice[t_id_x + line_start] = *U;
  __syncthreads();
  if (is_active) {
    for (CeedInt q = 0; q < Q_1D; q++) {
      sum += B[t_id_x + q * P_1D] * data.slice[q + line_start];  // Contract x direction
    }
  }
  *V = sum;
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D transpose tensor contract add x
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Apply the transpose of B along x and accumulate into V: for t_id_x < P_1D, t_id_y < P_1D and t_id_z < P_1D,
//   V += sum_{i < Q_1D} B[t_id_x + i * P_1D] * slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D];
// every other thread leaves V untouched. NUM_COMP is not referenced by the body.
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
                                                        const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  // Offset of this thread's x-line in the scratch buffer (strides T_1D and T_1D * T_1D)
  const int line_start = t_id_y * T_1D + t_id_z * T_1D * T_1D;

  data.slice[t_id_x + line_start] = *U;
  __syncthreads();
  if ((t_id_x < P_1D) && (t_id_y < P_1D) && (t_id_z < P_1D)) {
    // Start from the existing value so terms are added in the same order as a direct += on *V
    CeedScalar running = *V;

    for (CeedInt q = 0; q < Q_1D; q++) {
      running += B[t_id_x + q * P_1D] * data.slice[q + line_start];  // Contract x direction
    }
    *V = running;
  }
  // Reads must complete before the scratch buffer is reused
  __syncthreads();
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D pack/unpack quadrature values
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Re-index each of the NUM_COMP values in U from the (t_id_x, t_id_y, t_id_z) layout with strides
// (1, T_1D, T_1D * T_1D) to a layout where the flat index data.t_id_x enumerates a
// Q_1D x Q_1D x Q_1D grid, x fastest and z slowest.
//
//   data   - per-thread data; data.t_id_x is the flat thread index and data.slice the scratch buffer
//   t_id_* - this thread's coordinates in the T_1D-strided layout
//   U      - NUM_COMP values, rearranged in place; threads with flat index >= Q_1D^3 receive 0
//
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
  // Only threads inside the Q_1D^3 grid hold a value worth staging; all three coordinates are checked
  const bool is_source = (t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < Q_1D);
  // The packed grid has Q_1D^3 points, so the flat index is compared against the full 3D count;
  // comparing against Q_1D * Q_1D would zero every point beyond the first z-layer
  const bool is_target = data.t_id_x < (Q_1D * Q_1D * Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (is_source) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = is_target ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
    // The next component reuses the same scratch entries
    __syncthreads();
  }
}
*c8e372f0SJeremy L Thompson
// Inverse re-indexing of QPack3d: each of the NUM_COMP values in U moves from the layout where the flat
// index data.t_id_x enumerates a Q_1D x Q_1D x Q_1D grid (x fastest, z slowest) to the
// (t_id_x, t_id_y, t_id_z) layout with strides (1, T_1D, T_1D * T_1D).
//
//   data   - per-thread data; data.t_id_x is the flat thread index and data.slice the scratch buffer
//   t_id_* - this thread's coordinates in the T_1D-strided layout
//   U      - NUM_COMP values, rearranged in place; threads with any coordinate >= Q_1D receive 0
//
// All threads of the block must call this function because it contains barriers.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) {
  const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
  // The packed grid has Q_1D^3 points, so the flat index is compared against the full 3D count;
  // comparing against Q_1D * Q_1D would scatter only the first z-layer
  const bool is_source = data.t_id_x < (Q_1D * Q_1D * Q_1D);
  // All three coordinates are checked so threads outside the grid get 0 rather than a stale scratch value
  const bool is_target = (t_id_x < Q_1D) && (t_id_y < Q_1D) && (t_id_z < Q_1D);

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    if (is_source) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
    __syncthreads();
    U[comp] = is_target ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
    // The next component reuses the same scratch entries
    __syncthreads();
  }
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D interpolate to quadrature points
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D interpolation driver. Steps, in order:
//   1. QUnpack3d with extent P_1D rearranges r_U (modified in place);
//   2. per component, c_B is applied along x, then y, then z, writing r_V[comp];
//   3. QPack3d with extent Q_1D rearranges r_V.
// r_U and r_V each hold NUM_COMP values for the calling thread.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with strides (1, T_1D, T_1D * T_1D)
  const CeedInt thread_x = data.t_id_x % T_1D;
  const CeedInt thread_y = (data.t_id_x % (T_1D * T_1D)) / T_1D;
  const CeedInt thread_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_x, r_xy;  // intermediate results after the x and the x+y contractions

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, thread_z, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, r_U + c, c_B, &r_x);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, &r_x, c_B, &r_xy);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, &r_xy, c_B, r_V + c);
  }
  QPack3d<NUM_COMP, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D interpolate transpose
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D transpose-interpolation driver. Steps, in order:
//   1. QUnpack3d with extent Q_1D rearranges r_U (modified in place);
//   2. per component, the transpose of c_B is applied along z, then y, then x, writing r_V[comp];
//   3. QPack3d with extent P_1D rearranges r_V.
// r_U and r_V each hold NUM_COMP values for the calling thread.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Split the flat thread index into coordinates with strides (1, T_1D, T_1D * T_1D)
  const CeedInt thread_x = data.t_id_x % T_1D;
  const CeedInt thread_y = (data.t_id_x % (T_1D * T_1D)) / T_1D;
  const CeedInt thread_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_z, r_zy;  // intermediate results after the z and the z+y contractions

  QUnpack3d<NUM_COMP, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, r_U);
  for (CeedInt c = 0; c < NUM_COMP; c++) {
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, r_U + c, c_B, &r_z);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, &r_z, c_B, &r_zy);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, thread_x, thread_y, thread_z, &r_zy, c_B, r_V + c);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, thread_x, thread_y, thread_z, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D derivatives at quadrature points
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Gradient of each component: for every derivative direction, run an x, y, z contraction chain
// in which c_G is used along that direction and c_B along the other two.
//   data - shared kernel state; data.t_id_x is the flattened thread index
//   r_U  - per-thread input values, one per component; handed to QUnpack3d first
//   c_B  - basis matrix used along the non-differentiated directions
//   c_G  - basis matrix used along the differentiated direction
//   r_V  - per-thread output, 3 * NUM_COMP values; direction d of component c lands at r_V[c + d * NUM_COMP]
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Split the flattened thread index into (x, y, z) coordinates with extent T_1D in x and y
  const CeedInt tx = data.t_id_x % T_1D;
  const CeedInt ty = (data.t_id_x % (T_1D * T_1D)) / T_1D;
  const CeedInt tz = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch passed between contraction stages
  CeedScalar after_x, after_y;

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, tx, ty, tz, r_U);
  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    // dir = 0, 1, 2 selects the x-, y-, z-derivative; only that direction's contraction receives c_G
    for (CeedInt dir = 0; dir < 3; ++dir) {
      const CeedScalar *mat_x = (dir == 0) ? c_G : c_B;
      const CeedScalar *mat_y = (dir == 1) ? c_G : c_B;
      const CeedScalar *mat_z = (dir == 2) ? c_G : c_B;

      ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[c], mat_x, &after_x);
      ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_x, mat_y, &after_y);
      ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_y, mat_z, &r_V[c + dir * NUM_COMP]);
    }
  }
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, tx, ty, tz, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D derivatives transpose
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Transpose gradient: for each component, run a z, y, x transpose-contraction chain on each of the
// three derivative slots of r_U and sum the three results into one output value.
//   data - shared kernel state; data.t_id_x is the flattened thread index
//   r_U  - per-thread input, 3 * NUM_COMP values; direction d of component c is read from r_U[c + d * NUM_COMP]
//   c_B  - basis matrix used along the non-differentiated directions
//   c_G  - basis matrix used along the differentiated direction
//   r_V  - per-thread output, one value per component; handed to QPack3d last
// The thread coordinates are runtime values, so they are passed as function arguments after `data`;
// the template arguments are only <NUM_COMP, P_1D, Q_1D, T_1D>, matching the other call sites in this file.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Split the flattened thread index into (x, y, z) coordinates with extent T_1D in x and y
  const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x % (T_1D * T_1D)) / T_1D, t_id_z = data.t_id_x / (T_1D * T_1D);
  CeedScalar    r_t1[1], r_t2[1];

  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_U);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // x-derivative slot: c_G along x; the final contraction overwrites r_V[comp]
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 0 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_G, &r_V[comp]);
    // y-derivative slot: c_G along y; the final contraction accumulates into r_V[comp]
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 1 * NUM_COMP], c_B, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_G, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
    // z-derivative slot: c_G along z; the final contraction accumulates into r_V[comp]
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, &r_U[comp + 2 * NUM_COMP], c_G, r_t1);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t1, c_B, r_t2);
    ContractTransposeAddX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_t2, c_B, &r_V[comp]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, t_id_x, t_id_y, t_id_z, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D collocated derivatives at quadrature points
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Collocated gradient: first contract each component with c_B along x, y, z, then apply c_G
// to that single result once per direction (with Q_1D used for both helper size arguments).
//   data - shared kernel state; data.t_id_x is the flattened thread index
//   r_U  - per-thread input values, one per component; handed to QUnpack3d first
//   c_B  - basis matrix for the three initial contractions
//   c_G  - matrix for the three per-direction contractions (presumably the Q_1D x Q_1D collocated gradient - confirm)
//   r_V  - per-thread output, 3 * NUM_COMP values; direction d of component c lands at r_V[c + d * NUM_COMP]
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                       const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Split the flattened thread index into (x, y, z) coordinates with extent T_1D in x and y
  const CeedInt tx = data.t_id_x % T_1D;
  const CeedInt ty = (data.t_id_x % (T_1D * T_1D)) / T_1D;
  const CeedInt tz = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch: partial results after x and y, and the fully contracted value
  CeedScalar after_x, after_y, u_q;

  QUnpack3d<NUM_COMP, P_1D, T_1D>(data, tx, ty, tz, r_U);
  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    // Stage 1: contract with c_B in all three directions
    ContractX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[c], c_B, &after_x);
    ContractY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_x, c_B, &after_y);
    ContractZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_y, c_B, &u_q);
    // Stage 2: one c_G contraction per direction, all reading the same stage-1 value
    ContractX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &u_q, c_G, &r_V[c + 0 * NUM_COMP]);
    ContractY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &u_q, c_G, &r_V[c + 1 * NUM_COMP]);
    ContractZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &u_q, c_G, &r_V[c + 2 * NUM_COMP]);
  }
  QPack3d<NUM_COMP * 3, Q_1D, T_1D>(data, tx, ty, tz, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Collocated transpose gradient: combine the three derivative slots of each component through
// c_G transpose contractions (z overwrites, y and x accumulate), then contract the sum with
// c_B along z, y, x.
//   data - shared kernel state; data.t_id_x is the flattened thread index
//   r_U  - per-thread input, 3 * NUM_COMP values; direction d of component c is read from r_U[c + d * NUM_COMP]
//   c_B  - basis matrix for the final three transpose contractions
//   c_G  - matrix for the per-direction transpose contractions (presumably the Q_1D x Q_1D collocated gradient - confirm)
//   r_V  - per-thread output values, one per component; handed to QPack3d last
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                                const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Split the flattened thread index into (x, y, z) coordinates with extent T_1D in x and y
  const CeedInt tx = data.t_id_x % T_1D;
  const CeedInt ty = (data.t_id_x % (T_1D * T_1D)) / T_1D;
  const CeedInt tz = data.t_id_x / (T_1D * T_1D);
  // Single-value scratch: summed c_G contributions, then partial results after z and y
  CeedScalar grad_sum, after_z, after_y;

  QUnpack3d<NUM_COMP * 3, Q_1D, T_1D>(data, tx, ty, tz, r_U);
  for (CeedInt c = 0; c < NUM_COMP; ++c) {
    // Stage 1: the z slot initializes grad_sum; the y and x slots are added onto it
    ContractTransposeZ3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[c + 2 * NUM_COMP], c_G, &grad_sum);
    ContractTransposeAddY3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[c + 1 * NUM_COMP], c_G, &grad_sum);
    ContractTransposeAddX3dFlattened<NUM_COMP, Q_1D, Q_1D, T_1D>(data, tx, ty, tz, &r_U[c + 0 * NUM_COMP], c_G, &grad_sum);
    // Stage 2: contract the sum with c_B along z, y, x
    ContractTransposeZ3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &grad_sum, c_B, &after_z);
    ContractTransposeY3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_z, c_B, &after_y);
    ContractTransposeX3dFlattened<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, tz, &after_y, c_B, &r_V[c]);
  }
  QPack3d<NUM_COMP, P_1D, T_1D>(data, tx, ty, tz, r_V);
}
*c8e372f0SJeremy L Thompson
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
*c8e372f0SJeremy L Thompson// 3D quadrature weights
*c8e372f0SJeremy L Thompson//------------------------------------------------------------------------------
// Write this thread's 3D quadrature weight as the product of three 1D weights.
//   data        - shared kernel state; data.t_id_x is the flattened thread index
//   q_weight_1d - 1D weights, indexed by each of the thread's three coordinates
//   w           - output: the weight product, or 0.0 when a coordinate is not below Q_1D
// Note: the thread index is decomposed with Q_1D here; P_1D is not referenced in the body.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  const CeedInt qx = data.t_id_x % Q_1D;
  const CeedInt qy = (data.t_id_x % (Q_1D * Q_1D)) / Q_1D;
  const CeedInt qz = data.t_id_x / (Q_1D * Q_1D);

  if (qx < Q_1D && qy < Q_1D && qz < Q_1D) {
    *w = q_weight_1d[qx] * q_weight_1d[qy] * q_weight_1d[qz];
  } else {
    // Out-of-range threads contribute a zero weight and never read q_weight_1d
    *w = 0.0;
  }
}