jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis AtPoints templates
*9e1d4b82SJeremy L Thompson#include <ceed/types.h>
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D terms of the sequence f_0 = 1, f_1 = 2 x, f_i = 2 x f_{i-1} - f_{i-2}
//   (the three-term recurrence of the Chebyshev polynomials of the second kind) at the point x.
//
//   x           - evaluation coordinate
//   chebyshev_x - output array with room for Q_1D values; entry i receives f_i(x)
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  // Only seed the second entry when it exists, so a length-1 output is never overrun
  if (Q_1D > 1) chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}
*9e1d4b82SJeremy L Thompson
// Evaluate the x-derivatives of the first Q_1D terms of the sequence f_0 = 1, f_1 = 2 x, f_i = 2 x f_{i-1} - f_{i-2}.
//   Differentiating the recurrence gives f'_i = 2 x f'_{i-1} + 2 f_{i-1} - f'_{i-2}, so only the two most recent
//   values of f are carried along while the derivatives are written to the output.
//
//   x            - evaluation coordinate
//   chebyshev_dx - output array with room for Q_1D values; entry i receives f'_i(x)
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar value_prev = 1.0;    // f_{i-2} on entry to iteration i
  CeedScalar value_curr = 2 * x;  // f_{i-1} on entry to iteration i

  chebyshev_dx[0] = 0.0;
  // Only seed the second entry when it exists, so a length-1 output is never overrun
  if (Q_1D > 1) chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    const CeedScalar value_next = 2 * x * value_curr - value_prev;

    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * value_curr - chebyshev_dx[i - 2];
    value_prev      = value_curr;
    value_curr      = value_next;
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 1D interpolate to points
//------------------------------------------------------------------------------
// Evaluate NUM_COMP fields at this thread's point: r_V[comp] = sum_i chebyshev(r_X[0])[i] * C_comp[i].
//
//   data - per-thread handle; threads with t_id_x < Q_1D each publish one coefficient to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficient for each component, indexed [comp]
//   r_X  - point coordinates; only r_X[0] is read
//   r_V  - output, one value per component
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Wait for all readers of the previous slice contents before overwriting them
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of the 1D point interpolation: scatter this thread's point values back onto the Q_1D coefficients.
//   For each component, every thread with p < NUM_POINTS adds chebyshev(r_X[0])[i] * r_U[comp] into data.slice[i];
//   threads with t_id_x < Q_1D then add slice[t_id_x] to r_C[comp].
//
//   data - per-thread handle providing t_id_x and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point values, indexed [comp]
//   r_X  - point coordinates; only r_X[0] is read
//   r_C  - accumulated output coefficients, indexed [comp]; values are added to the existing contents
//          (as in the 2D and 3D transpose variants), so the caller must initialize r_C
//
// Contains block-wide barriers, so every thread of the block must call it.
// NOTE(review): atomicAdd on double needs compute capability 6.0+ if CeedScalar is double - confirm build targets.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: the start index is shifted by p so concurrent threads tend to hit different entries;
        //       the target stays inside the Q_1D entries that were cleared above and are read below
        const CeedInt ii = (i + p) % Q_1D;

        atomicAdd(&data.slice[ii], chebyshev_x[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    // Each thread reads back the entry it cleared, so the next iteration's clear cannot race with this read
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 1D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the derivative of NUM_COMP fields at this thread's point:
//   r_V[comp] = sum_i chebyshev'(r_X[0])[i] * C_comp[i].
//
//   data - per-thread handle; threads with t_id_x < Q_1D each publish one coefficient to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficient for each component, indexed [comp]
//   r_X  - point coordinates; only r_X[0] is read
//   r_V  - output, one derivative value per component
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_dx[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_dx);
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Wait for all readers of the previous slice contents before overwriting them
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_dx[i] * data.slice[i];
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of the 1D point derivative: scatter this thread's point values back onto the Q_1D coefficients.
//   For each component, every thread with p < NUM_POINTS adds chebyshev'(r_X[0])[i] * r_U[comp] into data.slice[i];
//   threads with t_id_x < Q_1D then add slice[t_id_x] to r_C[comp].
//
//   data - per-thread handle providing t_id_x and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point derivative values, indexed [comp]
//   r_X  - point coordinates; only r_X[0] is read
//   r_C  - accumulated output coefficients, indexed [comp]; values are added to the existing contents
//          (as in the 2D and 3D transpose variants), so the caller must initialize r_C
//
// Contains block-wide barriers, so every thread of the block must call it.
// NOTE(review): atomicAdd on double needs compute capability 6.0+ if CeedScalar is double - confirm build targets.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_dx[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_dx);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: the start index is shifted by p so concurrent threads tend to hit different entries;
        //       the target stays inside the Q_1D entries that were cleared above and are read below
        const CeedInt ii = (i + p) % Q_1D;

        atomicAdd(&data.slice[ii], chebyshev_dx[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    // Each thread reads back the entry it cleared, so the next iteration's clear cannot race with this read
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 2D interpolate to points
//------------------------------------------------------------------------------
// Evaluate NUM_COMP fields at this thread's point from a Q_1D x Q_1D coefficient tile:
//   r_V[comp] = sum_i sum_j chebyshev(r_X[1])[i] * chebyshev(r_X[0])[j] * C_comp[j + i * Q_1D].
//
//   data - per-thread handle; threads with t_id_x, t_id_y < Q_1D each publish one coefficient to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficient for each component, indexed [comp]
//   r_X  - point coordinates; r_X[0] and r_X[1] are read
//   r_V  - output, one value per component
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Wait for all readers of the previous slice contents before overwriting them
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = 0.0;
      for (CeedInt j = 0; j < Q_1D; j++) {
        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
      }
    }
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * buffer[i];
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of the 2D point interpolation: scatter this thread's point values onto the Q_1D x Q_1D coefficient tile.
//   For each component the shared slice is zeroed, every thread with p < NUM_POINTS atomically adds
//   chebyshev(r_X[0])[col] * chebyshev(r_X[1])[row] * r_U[comp] into slice[col + row * Q_1D],
//   and threads inside the tile add their own slice entry to r_C[comp].
//
//   data - per-thread handle providing t_id_x, t_id_y and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point values, indexed [comp]
//   r_X  - point coordinates; r_X[0] and r_X[1] are read
//   r_C  - accumulated output coefficients, indexed [comp]; values are added to the existing contents
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  // True for the threads that own one entry of the Q_1D x Q_1D tile
  const bool in_tile = (data.t_id_x < Q_1D) && (data.t_id_y < Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    CeedScalar weighted_y[Q_1D];
    CeedScalar basis[Q_1D];

    // Zero the tile and wait until it is fully cleared
    if (in_tile) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    // y direction: scale the point value by each y basis value
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], basis);
    for (CeedInt row = 0; row < Q_1D; row++) weighted_y[row] = basis[row] * r_U[c];
    // x direction: accumulate the outer product into the tile
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], basis);
    if (p < NUM_POINTS) {
      const CeedInt row_shift = p / Q_1D;

      for (CeedInt i = 0; i < Q_1D; i++) {
        // Rows and columns are visited in a point-dependent rotated order so that
        // different points tend to start on different tile entries
        const CeedInt row = (i + row_shift) % Q_1D;

        for (CeedInt j = 0; j < Q_1D; j++) {
          const CeedInt col = (j + p) % Q_1D;

          atomicAdd(&data.slice[col + row * Q_1D], basis[col] * weighted_y[row]);
        }
      }
    }
    // Wait for all contributions, then add this thread's tile entry to its output
    __syncthreads();
    if (in_tile) r_C[c] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 2D derivatives at points
//------------------------------------------------------------------------------
// Evaluate both partial derivatives of NUM_COMP fields at this thread's point from a Q_1D x Q_1D coefficient tile.
//   For direction dim, the derivative basis is used along that direction and the value basis along the other.
//
//   data - per-thread handle; threads with t_id_x, t_id_y < Q_1D each publish one coefficient to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficient for each component, indexed [comp]
//   r_X  - point coordinates; r_X[0] and r_X[1] are read
//   r_V  - output, 2 * NUM_COMP values indexed [comp + dim * NUM_COMP]
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Wait for all readers of the previous slice contents before overwriting them
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract x direction
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y direction
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
      }
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of the 2D point gradient: scatter this thread's point derivative values onto the Q_1D x Q_1D tile.
//   For each component the shared slice is zeroed once, then both derivative directions are accumulated into it
//   (derivative basis along the active direction, value basis along the other), and finally threads inside the
//   tile add their own slice entry to r_C[comp].
//
//   data - per-thread handle providing t_id_x, t_id_y and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point derivative values, indexed [comp + dim * NUM_COMP]
//   r_X  - point coordinates; r_X[0] and r_X[1] are read
//   r_C  - accumulated output coefficients, indexed [comp]; values are added to the existing contents
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  // True for the threads that own one entry of the Q_1D x Q_1D tile
  const bool in_tile = (data.t_id_x < Q_1D) && (data.t_id_y < Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    CeedScalar weighted_y[Q_1D];
    CeedScalar basis[Q_1D];

    // Zero the tile and wait until it is fully cleared
    if (in_tile) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    for (CeedInt d = 0; d < 2; d++) {
      const bool diff_in_x = (d == 0);
      const bool diff_in_y = (d == 1);

      // y direction: scale the point value by each y basis value
      if (diff_in_y) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], basis);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], basis);
      for (CeedInt row = 0; row < Q_1D; row++) weighted_y[row] = basis[row] * r_U[c + d * NUM_COMP];
      // x direction: accumulate the outer product into the tile
      if (diff_in_x) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], basis);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], basis);
      if (p < NUM_POINTS) {
        const CeedInt row_shift = p / Q_1D;

        for (CeedInt i = 0; i < Q_1D; i++) {
          // Rows and columns are visited in a point-dependent rotated order so that
          // different points tend to start on different tile entries
          const CeedInt row = (i + row_shift) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt col = (j + p) % Q_1D;

            atomicAdd(&data.slice[col + row * Q_1D], basis[col] * weighted_y[row]);
          }
        }
      }
    }
    // Wait for all contributions, then add this thread's tile entry to its output
    __syncthreads();
    if (in_tile) r_C[c] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 3D interpolate to points
//------------------------------------------------------------------------------
// Evaluate NUM_COMP fields at this thread's point from Q_1D^3 coefficients, processed one z-layer (k) at a time.
//   Each layer is staged in the shared slice, contracted in x and y, and scaled by the z basis value for layer k.
//
//   data - per-thread handle; threads with t_id_x, t_id_y < Q_1D each publish one coefficient per layer to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficients, indexed [k + comp * Q_1D]
//   r_X  - point coordinates; r_X[0], r_X[1] and r_X[2] are read
//   r_V  - output, one value per component
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Wait for all readers of the previous layer before overwriting the slice
      __syncthreads();
      // Load coefficients
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y and z direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
      }
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of the 3D point interpolation, processed one z-layer (k) at a time.
//   For each component and layer the shared slice is zeroed, every thread with p < NUM_POINTS atomically adds
//   its point value weighted by the x, y and z basis values into the Q_1D x Q_1D tile, and threads inside the
//   tile add their own slice entry to r_C[k + comp * Q_1D].
//
//   data - per-thread handle providing t_id_x, t_id_y and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point values, indexed [comp]
//   r_X  - point coordinates; r_X[0], r_X[1] and r_X[2] are read
//   r_C  - accumulated output coefficients, indexed [k + comp * Q_1D]; values are added to the existing contents
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  // True for the threads that own one entry of the Q_1D x Q_1D tile
  const bool in_tile = (data.t_id_x < Q_1D) && (data.t_id_y < Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    for (CeedInt layer = 0; layer < Q_1D; layer++) {
      CeedScalar weighted_yz[Q_1D];
      CeedScalar basis[Q_1D];

      // Zero the tile and wait until it is fully cleared
      if (in_tile) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      // z direction: pick the basis value belonging to this layer
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], basis);
      const CeedScalar z_weight = basis[layer];

      // y direction: scale the point value by each y basis value and the z weight
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], basis);
      for (CeedInt row = 0; row < Q_1D; row++) weighted_yz[row] = basis[row] * r_U[c] * z_weight;
      // x direction: accumulate the outer product into the tile
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], basis);
      if (p < NUM_POINTS) {
        const CeedInt row_shift = p / Q_1D;

        for (CeedInt i = 0; i < Q_1D; i++) {
          // Rows and columns are visited in a point-dependent rotated order so that
          // different points tend to start on different tile entries
          const CeedInt row = (i + row_shift) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt col = (j + p) % Q_1D;

            atomicAdd(&data.slice[col + row * Q_1D], basis[col] * weighted_yz[row]);
          }
        }
      }
      // Wait for all contributions, then add this thread's tile entry to its output
      __syncthreads();
      if (in_tile) r_C[layer + c * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 3D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the three partial derivatives of NUM_COMP fields at this thread's point from Q_1D^3 coefficients,
//   processed one z-layer (k) at a time. For direction dim, the derivative basis is used along that direction
//   and the value basis along the other two.
//
//   data - per-thread handle; threads with t_id_x, t_id_y < Q_1D each publish one coefficient per layer to data.slice
//   p    - point index (unused here; kept for a uniform signature with the transpose variants)
//   r_C  - this thread's coefficients, indexed [k + comp * Q_1D]
//   r_X  - point coordinates; r_X[0], r_X[1] and r_X[2] are read
//   r_V  - output, 3 * NUM_COMP values indexed [comp + dim * NUM_COMP]
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Wait for all readers of the previous layer before overwriting the slice
      __syncthreads();
      // Load coefficients
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract x direction
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = 0.0;
          for (CeedInt j = 0; j < Q_1D; j++) {
            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
          }
        }
        // Contract y and z direction
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
        }
      }
    }
  }
}
*9e1d4b82SJeremy L Thompson
//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of the 3D point gradient, processed one z-layer (k) at a time.
//   For each component and layer the shared slice is zeroed once, then all three derivative directions are
//   accumulated into it (derivative basis along the active direction, value basis along the other two), and
//   finally threads inside the tile add their own slice entry to r_C[k + comp * Q_1D].
//
//   data - per-thread handle providing t_id_x, t_id_y and the shared slice
//   p    - index of this thread's point; threads with p >= NUM_POINTS contribute nothing
//   r_U  - point derivative values, indexed [comp + dim * NUM_COMP]
//   r_X  - point coordinates; r_X[0], r_X[1] and r_X[2] are read
//   r_C  - accumulated output coefficients, indexed [k + comp * Q_1D]; values are added to the existing contents
//
// Contains block-wide barriers, so every thread of the block must call it.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  // True for the threads that own one entry of the Q_1D x Q_1D tile
  const bool in_tile = (data.t_id_x < Q_1D) && (data.t_id_y < Q_1D);

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    for (CeedInt layer = 0; layer < Q_1D; layer++) {
      CeedScalar weighted_yz[Q_1D];
      CeedScalar basis[Q_1D];

      // Zero the tile and wait until it is fully cleared
      if (in_tile) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      for (CeedInt d = 0; d < 3; d++) {
        const bool diff_in_x = (d == 0);
        const bool diff_in_y = (d == 1);
        const bool diff_in_z = (d == 2);

        // z direction: pick the basis (or derivative) value belonging to this layer
        if (diff_in_z) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], basis);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], basis);
        const CeedScalar z_weight = basis[layer];

        // y direction: scale the point value by each y basis value and the z weight
        if (diff_in_y) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], basis);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], basis);
        for (CeedInt row = 0; row < Q_1D; row++) weighted_yz[row] = basis[row] * r_U[c + d * NUM_COMP] * z_weight;
        // x direction: accumulate the outer product into the tile
        if (diff_in_x) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], basis);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], basis);
        if (p < NUM_POINTS) {
          const CeedInt row_shift = p / Q_1D;

          for (CeedInt i = 0; i < Q_1D; i++) {
            // Rows and columns are visited in a point-dependent rotated order so that
            // different points tend to start on different tile entries
            const CeedInt row = (i + row_shift) % Q_1D;

            for (CeedInt j = 0; j < Q_1D; j++) {
              const CeedInt col = (j + p) % Q_1D;

              atomicAdd(&data.slice[col + row * Q_1D], basis[col] * weighted_yz[row]);
            }
          }
        }
      }
      // Wait for all contributions, then add this thread's tile entry to its output
      __syncthreads();
      if (in_tile) r_C[layer + c * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}