jit-source/hip/hip-ref-basis-tensor-at-points.h

// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

/// @file
/// Internal header for HIP tensor product basis with AtPoints evaluation
c0b5abf0SJeremy L Thompson#include <ceed/types.h>
1c21e869SJeremy L Thompson
//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
1c21e869SJeremy L Thompsontemplate <int Q_1D>
1c21e869SJeremy L Thompsoninline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
1c21e869SJeremy L Thompson  chebyshev_x[0] = 1.0;
1c21e869SJeremy L Thompson  chebyshev_x[1] = 2 * x;
1c21e869SJeremy L Thompson  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
1c21e869SJeremy L Thompson}
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompsontemplate <int Q_1D>
1c21e869SJeremy L Thompsoninline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
1c21e869SJeremy L Thompson  CeedScalar chebyshev_x[3];
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson  chebyshev_x[1]  = 1.0;
1c21e869SJeremy L Thompson  chebyshev_x[2]  = 2 * x;
1c21e869SJeremy L Thompson  chebyshev_dx[0] = 0.0;
1c21e869SJeremy L Thompson  chebyshev_dx[1] = 2.0;
1c21e869SJeremy L Thompson  for (CeedInt i = 2; i < Q_1D; i++) {
80c135a8SJeremy L Thompson    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
80c135a8SJeremy L Thompson    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
1c21e869SJeremy L Thompson  }
1c21e869SJeremy L Thompson}
1c21e869SJeremy L Thompson
//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
*81ae6159SJeremy L Thompsonextern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
111870feSJeremy L Thompson                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
111870feSJeremy L Thompson                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
1c21e869SJeremy L Thompson  const CeedInt i = threadIdx.x;
1c21e869SJeremy L Thompson
f7c9815fSJeremy L Thompson  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
1c21e869SJeremy L Thompson  CeedScalar           *s_chebyshev_interp_1d = s_mem;
1c21e869SJeremy L Thompson  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
1c21e869SJeremy L Thompson  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
1c21e869SJeremy L Thompson  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
1c21e869SJeremy L Thompson  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
1c21e869SJeremy L Thompson  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
1c21e869SJeremy L Thompson    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
1c21e869SJeremy L Thompson  }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson  const CeedInt P             = BASIS_P_1D;
1c21e869SJeremy L Thompson  const CeedInt Q             = BASIS_Q_1D;
*81ae6159SJeremy L Thompson  const CeedInt u_stride      = BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt v_stride      = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt u_size        = BASIS_NUM_NODES;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson  // Apply basis element by element
*81ae6159SJeremy L Thompson  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
*81ae6159SJeremy L Thompson    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
*81ae6159SJeremy L Thompson      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
*81ae6159SJeremy L Thompson      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
*81ae6159SJeremy L Thompson      CeedInt           pre   = u_size;
*81ae6159SJeremy L Thompson      CeedInt           post  = 1;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson      // Map to coefficients
*81ae6159SJeremy L Thompson      for (CeedInt d = 0; d < BASIS_DIM; d++) {
*81ae6159SJeremy L Thompson        __syncthreads();
*81ae6159SJeremy L Thompson        // Update buffers used
*81ae6159SJeremy L Thompson        pre /= P;
*81ae6159SJeremy L Thompson        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
*81ae6159SJeremy L Thompson        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
*81ae6159SJeremy L Thompson        const CeedInt     writeLen = pre * post * Q;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson        // Contract along middle index
*81ae6159SJeremy L Thompson        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
*81ae6159SJeremy L Thompson          const CeedInt c   = k % post;
*81ae6159SJeremy L Thompson          const CeedInt j   = (k / post) % Q;
*81ae6159SJeremy L Thompson          const CeedInt a   = k / (post * Q);
*81ae6159SJeremy L Thompson          CeedScalar    v_k = 0;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
*81ae6159SJeremy L Thompson          out[k] = v_k;
*81ae6159SJeremy L Thompson        }
*81ae6159SJeremy L Thompson        post *= Q;
*81ae6159SJeremy L Thompson      }
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson      // Map to point
*81ae6159SJeremy L Thompson      __syncthreads();
*81ae6159SJeremy L Thompson      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
*81ae6159SJeremy L Thompson        pre  = BASIS_NUM_QPTS;
*81ae6159SJeremy L Thompson        post = 1;
*81ae6159SJeremy L Thompson        for (CeedInt d = 0; d < BASIS_DIM; d++) {
*81ae6159SJeremy L Thompson          // Update buffers used
*81ae6159SJeremy L Thompson          pre /= Q;
*81ae6159SJeremy L Thompson          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
*81ae6159SJeremy L Thompson          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson          // Build Chebyshev polynomial values
*81ae6159SJeremy L Thompson          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson          // Contract along middle index
*81ae6159SJeremy L Thompson          for (CeedInt a = 0; a < pre; a++) {
*81ae6159SJeremy L Thompson            for (CeedInt c = 0; c < post; c++) {
*81ae6159SJeremy L Thompson              CeedScalar v_k = 0;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
*81ae6159SJeremy L Thompson              out[a * post + c] = v_k;
*81ae6159SJeremy L Thompson            }
*81ae6159SJeremy L Thompson          }
*81ae6159SJeremy L Thompson          post *= 1;
*81ae6159SJeremy L Thompson        }
*81ae6159SJeremy L Thompson      }
*81ae6159SJeremy L Thompson    }
*81ae6159SJeremy L Thompson  }
*81ae6159SJeremy L Thompson}
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompsonextern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
*81ae6159SJeremy L Thompson                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
*81ae6159SJeremy L Thompson                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
*81ae6159SJeremy L Thompson  const CeedInt i = threadIdx.x;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
*81ae6159SJeremy L Thompson  CeedScalar           *s_chebyshev_interp_1d = s_mem;
*81ae6159SJeremy L Thompson  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
*81ae6159SJeremy L Thompson  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
*81ae6159SJeremy L Thompson  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
*81ae6159SJeremy L Thompson  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
*81ae6159SJeremy L Thompson  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
*81ae6159SJeremy L Thompson    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
*81ae6159SJeremy L Thompson  }
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  const CeedInt P             = BASIS_P_1D;
*81ae6159SJeremy L Thompson  const CeedInt Q             = BASIS_Q_1D;
*81ae6159SJeremy L Thompson  const CeedInt u_stride      = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt v_stride      = BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt u_size        = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  // Apply basis element by element
1c21e869SJeremy L Thompson  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
1c21e869SJeremy L Thompson    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
db2becc9SJeremy L Thompson      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
db2becc9SJeremy L Thompson      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
1c21e869SJeremy L Thompson      CeedInt           pre   = 1;
1c21e869SJeremy L Thompson      CeedInt           post  = 1;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Clear Chebyshev coeffs
1c21e869SJeremy L Thompson      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
1c21e869SJeremy L Thompson        s_chebyshev_coeffs[k] = 0.0;
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map from point
2d10e82cSJeremy L Thompson      __syncthreads();
2d10e82cSJeremy L Thompson      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
111870feSJeremy L Thompson        if (p >= points_per_elem[elem]) continue;
1c21e869SJeremy L Thompson        pre  = 1;
1c21e869SJeremy L Thompson        post = 1;
1c21e869SJeremy L Thompson        for (CeedInt d = 0; d < BASIS_DIM; d++) {
1c21e869SJeremy L Thompson          // Update buffers used
1c21e869SJeremy L Thompson          pre /= 1;
db2becc9SJeremy L Thompson          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
1c21e869SJeremy L Thompson          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          // Build Chebyshev polynomial values
1c21e869SJeremy L Thompson          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          // Contract along middle index
1c21e869SJeremy L Thompson          for (CeedInt a = 0; a < pre; a++) {
1c21e869SJeremy L Thompson            for (CeedInt c = 0; c < post; c++) {
1c21e869SJeremy L Thompson              if (d == BASIS_DIM - 1) {
ad8059fcSJeremy L Thompson                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
1c21e869SJeremy L Thompson              } else {
1c21e869SJeremy L Thompson                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
1c21e869SJeremy L Thompson              }
1c21e869SJeremy L Thompson            }
1c21e869SJeremy L Thompson          }
1c21e869SJeremy L Thompson          post *= Q;
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map from coefficients
1c21e869SJeremy L Thompson      pre  = BASIS_NUM_QPTS;
1c21e869SJeremy L Thompson      post = 1;
1c21e869SJeremy L Thompson      for (CeedInt d = 0; d < BASIS_DIM; d++) {
1c21e869SJeremy L Thompson        __syncthreads();
1c21e869SJeremy L Thompson        // Update buffers used
1c21e869SJeremy L Thompson        pre /= Q;
1c21e869SJeremy L Thompson        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
1c21e869SJeremy L Thompson        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
1c21e869SJeremy L Thompson        const CeedInt     writeLen = pre * post * P;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson        // Contract along middle index
1c21e869SJeremy L Thompson        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
1c21e869SJeremy L Thompson          const CeedInt c   = k % post;
1c21e869SJeremy L Thompson          const CeedInt j   = (k / post) % P;
1c21e869SJeremy L Thompson          const CeedInt a   = k / (post * P);
1c21e869SJeremy L Thompson          CeedScalar    v_k = 0;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
db2becc9SJeremy L Thompson          if (d == BASIS_DIM - 1) out[k] += v_k;
db2becc9SJeremy L Thompson          else out[k] = v_k;
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson        post *= P;
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson    }
1c21e869SJeremy L Thompson  }
*81ae6159SJeremy L Thompson}
*81ae6159SJeremy L Thompson
//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
*81ae6159SJeremy L Thompsonextern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
*81ae6159SJeremy L Thompson                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
*81ae6159SJeremy L Thompson                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
*81ae6159SJeremy L Thompson  const CeedInt i = threadIdx.x;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
*81ae6159SJeremy L Thompson  CeedScalar           *s_chebyshev_interp_1d = s_mem;
*81ae6159SJeremy L Thompson  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
*81ae6159SJeremy L Thompson  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
*81ae6159SJeremy L Thompson  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
*81ae6159SJeremy L Thompson  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
*81ae6159SJeremy L Thompson  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
*81ae6159SJeremy L Thompson    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
*81ae6159SJeremy L Thompson  }
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  const CeedInt P             = BASIS_P_1D;
*81ae6159SJeremy L Thompson  const CeedInt Q             = BASIS_Q_1D;
*81ae6159SJeremy L Thompson  const CeedInt u_stride      = BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt v_stride      = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt u_size        = BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt u_dim_stride  = 0;
*81ae6159SJeremy L Thompson  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
*81ae6159SJeremy L Thompson
*81ae6159SJeremy L Thompson  // Apply basis element by element
1c21e869SJeremy L Thompson  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
1c21e869SJeremy L Thompson    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
db2becc9SJeremy L Thompson      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
1c21e869SJeremy L Thompson      CeedInt           pre   = u_size;
1c21e869SJeremy L Thompson      CeedInt           post  = 1;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map to coefficients
1c21e869SJeremy L Thompson      for (CeedInt d = 0; d < BASIS_DIM; d++) {
1c21e869SJeremy L Thompson        __syncthreads();
1c21e869SJeremy L Thompson        // Update buffers used
1c21e869SJeremy L Thompson        pre /= P;
1c21e869SJeremy L Thompson        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
1c21e869SJeremy L Thompson        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
1c21e869SJeremy L Thompson        const CeedInt     writeLen = pre * post * Q;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson        // Contract along middle index
1c21e869SJeremy L Thompson        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
1c21e869SJeremy L Thompson          const CeedInt c   = k % post;
1c21e869SJeremy L Thompson          const CeedInt j   = (k / post) % Q;
1c21e869SJeremy L Thompson          const CeedInt a   = k / (post * Q);
1c21e869SJeremy L Thompson          CeedScalar    v_k = 0;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
1c21e869SJeremy L Thompson          out[k] = v_k;
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson        post *= Q;
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map to point
1c21e869SJeremy L Thompson      __syncthreads();
2d10e82cSJeremy L Thompson      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
*81ae6159SJeremy L Thompson        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
*81ae6159SJeremy L Thompson          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
*81ae6159SJeremy L Thompson
1c21e869SJeremy L Thompson          pre  = BASIS_NUM_QPTS;
1c21e869SJeremy L Thompson          post = 1;
*81ae6159SJeremy L Thompson          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
1c21e869SJeremy L Thompson            // Update buffers used
1c21e869SJeremy L Thompson            pre /= Q;
*81ae6159SJeremy L Thompson            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
*81ae6159SJeremy L Thompson            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson            // Build Chebyshev polynomial values
*81ae6159SJeremy L Thompson            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
*81ae6159SJeremy L Thompson            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson            // Contract along middle index
1c21e869SJeremy L Thompson            for (CeedInt a = 0; a < pre; a++) {
1c21e869SJeremy L Thompson              for (CeedInt c = 0; c < post; c++) {
1c21e869SJeremy L Thompson                CeedScalar v_k = 0;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
1c21e869SJeremy L Thompson                out[a * post + c] = v_k;
1c21e869SJeremy L Thompson              }
1c21e869SJeremy L Thompson            }
1c21e869SJeremy L Thompson            post *= 1;
1c21e869SJeremy L Thompson          }
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson    }
1c21e869SJeremy L Thompson  }
1c21e869SJeremy L Thompson}
1c21e869SJeremy L Thompson
*81ae6159SJeremy L Thompsonextern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
111870feSJeremy L Thompson                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
111870feSJeremy L Thompson                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
1c21e869SJeremy L Thompson  const CeedInt i = threadIdx.x;
1c21e869SJeremy L Thompson
f7c9815fSJeremy L Thompson  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
1c21e869SJeremy L Thompson  CeedScalar           *s_chebyshev_interp_1d = s_mem;
1c21e869SJeremy L Thompson  CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
1c21e869SJeremy L Thompson  CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
1c21e869SJeremy L Thompson  CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
1c21e869SJeremy L Thompson  CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
1c21e869SJeremy L Thompson  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
1c21e869SJeremy L Thompson    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
1c21e869SJeremy L Thompson  }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson  const CeedInt P             = BASIS_P_1D;
1c21e869SJeremy L Thompson  const CeedInt Q             = BASIS_Q_1D;
*81ae6159SJeremy L Thompson  const CeedInt u_stride      = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt v_stride      = BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
*81ae6159SJeremy L Thompson  const CeedInt u_size        = BASIS_NUM_PTS;
*81ae6159SJeremy L Thompson  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
*81ae6159SJeremy L Thompson  const CeedInt v_dim_stride  = 0;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson  // Apply basis element by element
1c21e869SJeremy L Thompson  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
1c21e869SJeremy L Thompson    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
db2becc9SJeremy L Thompson      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
1c21e869SJeremy L Thompson      CeedInt     pre   = 1;
1c21e869SJeremy L Thompson      CeedInt     post  = 1;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Clear Chebyshev coeffs
1c21e869SJeremy L Thompson      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
1c21e869SJeremy L Thompson        s_chebyshev_coeffs[k] = 0.0;
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map from point
2d10e82cSJeremy L Thompson      __syncthreads();
2d10e82cSJeremy L Thompson      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
111870feSJeremy L Thompson        if (p >= points_per_elem[elem]) continue;
1c21e869SJeremy L Thompson        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
db2becc9SJeremy L Thompson          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          pre  = 1;
1c21e869SJeremy L Thompson          post = 1;
1c21e869SJeremy L Thompson          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
1c21e869SJeremy L Thompson            // Update buffers used
1c21e869SJeremy L Thompson            pre /= 1;
db2becc9SJeremy L Thompson            const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
1c21e869SJeremy L Thompson            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson            // Build Chebyshev polynomial values
1c21e869SJeremy L Thompson            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
1c21e869SJeremy L Thompson            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson            // Contract along middle index
1c21e869SJeremy L Thompson            for (CeedInt a = 0; a < pre; a++) {
1c21e869SJeremy L Thompson              for (CeedInt c = 0; c < post; c++) {
1c21e869SJeremy L Thompson                if (dim_2 == BASIS_DIM - 1) {
ad8059fcSJeremy L Thompson                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
1c21e869SJeremy L Thompson                } else {
1c21e869SJeremy L Thompson                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
1c21e869SJeremy L Thompson                }
1c21e869SJeremy L Thompson              }
1c21e869SJeremy L Thompson            }
1c21e869SJeremy L Thompson            post *= Q;
1c21e869SJeremy L Thompson          }
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson      // Map from coefficients
1c21e869SJeremy L Thompson      pre  = BASIS_NUM_QPTS;
1c21e869SJeremy L Thompson      post = 1;
1c21e869SJeremy L Thompson      for (CeedInt d = 0; d < BASIS_DIM; d++) {
1c21e869SJeremy L Thompson        __syncthreads();
1c21e869SJeremy L Thompson        // Update buffers used
1c21e869SJeremy L Thompson        pre /= Q;
1c21e869SJeremy L Thompson        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
1c21e869SJeremy L Thompson        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
1c21e869SJeremy L Thompson        const CeedInt     writeLen = pre * post * P;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson        // Contract along middle index
1c21e869SJeremy L Thompson        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
1c21e869SJeremy L Thompson          const CeedInt c   = k % post;
1c21e869SJeremy L Thompson          const CeedInt j   = (k / post) % P;
1c21e869SJeremy L Thompson          const CeedInt a   = k / (post * P);
1c21e869SJeremy L Thompson          CeedScalar    v_k = 0;
1c21e869SJeremy L Thompson
1c21e869SJeremy L Thompson          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
db2becc9SJeremy L Thompson          if (d == BASIS_DIM - 1) out[k] += v_k;
db2becc9SJeremy L Thompson          else out[k] = v_k;
1c21e869SJeremy L Thompson        }
1c21e869SJeremy L Thompson        post *= P;
1c21e869SJeremy L Thompson      }
1c21e869SJeremy L Thompson    }
1c21e869SJeremy L Thompson  }
1c21e869SJeremy L Thompson}