xref: /libCEED/include/ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
21c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
31c21e869SJeremy L Thompson //
41c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
51c21e869SJeremy L Thompson //
61c21e869SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
71c21e869SJeremy L Thompson 
81c21e869SJeremy L Thompson /// @file
91c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
111c21e869SJeremy L Thompson 
121c21e869SJeremy L Thompson //------------------------------------------------------------------------------
131c21e869SJeremy L Thompson // Chebyshev values
141c21e869SJeremy L Thompson //------------------------------------------------------------------------------
151c21e869SJeremy L Thompson template <int Q_1D>
ChebyshevPolynomialsAtPoint(const CeedScalar x,CeedScalar * chebyshev_x)161c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
171c21e869SJeremy L Thompson   chebyshev_x[0] = 1.0;
181c21e869SJeremy L Thompson   chebyshev_x[1] = 2 * x;
191c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
201c21e869SJeremy L Thompson }
211c21e869SJeremy L Thompson 
221c21e869SJeremy L Thompson template <int Q_1D>
ChebyshevDerivativeAtPoint(const CeedScalar x,CeedScalar * chebyshev_dx)231c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
241c21e869SJeremy L Thompson   CeedScalar chebyshev_x[3];
251c21e869SJeremy L Thompson 
261c21e869SJeremy L Thompson   chebyshev_x[1]  = 1.0;
271c21e869SJeremy L Thompson   chebyshev_x[2]  = 2 * x;
281c21e869SJeremy L Thompson   chebyshev_dx[0] = 0.0;
291c21e869SJeremy L Thompson   chebyshev_dx[1] = 2.0;
301c21e869SJeremy L Thompson   for (CeedInt i = 2; i < Q_1D; i++) {
3180c135a8SJeremy L Thompson     chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
3280c135a8SJeremy L Thompson     chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
331c21e869SJeremy L Thompson   }
341c21e869SJeremy L Thompson }
351c21e869SJeremy L Thompson 
361c21e869SJeremy L Thompson //------------------------------------------------------------------------------
371c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints
381c21e869SJeremy L Thompson //------------------------------------------------------------------------------
391c21e869SJeremy L Thompson 
401c21e869SJeremy L Thompson //------------------------------------------------------------------------------
411c21e869SJeremy L Thompson // Interp
421c21e869SJeremy L Thompson //------------------------------------------------------------------------------
InterpAtPoints(const CeedInt num_elem,const CeedScalar * __restrict__ chebyshev_interp_1d,const CeedInt * __restrict__ points_per_elem,const CeedScalar * __restrict__ coords,const CeedScalar * __restrict__ u,CeedScalar * __restrict__ v)4381ae6159SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
44111870feSJeremy L Thompson                                           const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
45111870feSJeremy L Thompson                                           const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
461c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
471c21e869SJeremy L Thompson 
48f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
491c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
501c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
511c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
521c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
531c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
541c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
551c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
561c21e869SJeremy L Thompson   }
571c21e869SJeremy L Thompson 
581c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
591c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
6081ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_NODES;
6181ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_PTS;
6281ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
6381ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
6481ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_NODES;
651c21e869SJeremy L Thompson 
661c21e869SJeremy L Thompson   // Apply basis element by element
6781ae6159SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
6881ae6159SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
6981ae6159SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
7081ae6159SJeremy L Thompson       CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
7181ae6159SJeremy L Thompson       CeedInt           pre   = u_size;
7281ae6159SJeremy L Thompson       CeedInt           post  = 1;
7381ae6159SJeremy L Thompson 
7481ae6159SJeremy L Thompson       // Map to coefficients
7581ae6159SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
7681ae6159SJeremy L Thompson         __syncthreads();
7781ae6159SJeremy L Thompson         // Update buffers used
7881ae6159SJeremy L Thompson         pre /= P;
7981ae6159SJeremy L Thompson         const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
8081ae6159SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
8181ae6159SJeremy L Thompson         const CeedInt     writeLen = pre * post * Q;
8281ae6159SJeremy L Thompson 
8381ae6159SJeremy L Thompson         // Contract along middle index
8481ae6159SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
8581ae6159SJeremy L Thompson           const CeedInt c   = k % post;
8681ae6159SJeremy L Thompson           const CeedInt j   = (k / post) % Q;
8781ae6159SJeremy L Thompson           const CeedInt a   = k / (post * Q);
8881ae6159SJeremy L Thompson           CeedScalar    v_k = 0;
8981ae6159SJeremy L Thompson 
9081ae6159SJeremy L Thompson           for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
9181ae6159SJeremy L Thompson           out[k] = v_k;
9281ae6159SJeremy L Thompson         }
9381ae6159SJeremy L Thompson         post *= Q;
9481ae6159SJeremy L Thompson       }
9581ae6159SJeremy L Thompson 
9681ae6159SJeremy L Thompson       // Map to point
9781ae6159SJeremy L Thompson       __syncthreads();
9881ae6159SJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
9981ae6159SJeremy L Thompson         pre  = BASIS_NUM_QPTS;
10081ae6159SJeremy L Thompson         post = 1;
10181ae6159SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
10281ae6159SJeremy L Thompson           // Update buffers used
10381ae6159SJeremy L Thompson           pre /= Q;
10481ae6159SJeremy L Thompson           const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
10581ae6159SJeremy L Thompson           CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);
10681ae6159SJeremy L Thompson 
10781ae6159SJeremy L Thompson           // Build Chebyshev polynomial values
10881ae6159SJeremy L Thompson           ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);
10981ae6159SJeremy L Thompson 
11081ae6159SJeremy L Thompson           // Contract along middle index
11181ae6159SJeremy L Thompson           for (CeedInt a = 0; a < pre; a++) {
11281ae6159SJeremy L Thompson             for (CeedInt c = 0; c < post; c++) {
11381ae6159SJeremy L Thompson               CeedScalar v_k = 0;
11481ae6159SJeremy L Thompson 
11581ae6159SJeremy L Thompson               for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
11681ae6159SJeremy L Thompson               out[a * post + c] = v_k;
11781ae6159SJeremy L Thompson             }
11881ae6159SJeremy L Thompson           }
11981ae6159SJeremy L Thompson           post *= 1;
12081ae6159SJeremy L Thompson         }
12181ae6159SJeremy L Thompson       }
12281ae6159SJeremy L Thompson     }
12381ae6159SJeremy L Thompson   }
12481ae6159SJeremy L Thompson }
12581ae6159SJeremy L Thompson 
InterpTransposeAtPoints(const CeedInt num_elem,const CeedScalar * __restrict__ chebyshev_interp_1d,const CeedInt * __restrict__ points_per_elem,const CeedScalar * __restrict__ coords,const CeedScalar * __restrict__ u,CeedScalar * __restrict__ v)12681ae6159SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
12781ae6159SJeremy L Thompson                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
12881ae6159SJeremy L Thompson                                                    const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
12981ae6159SJeremy L Thompson   const CeedInt i = threadIdx.x;
13081ae6159SJeremy L Thompson 
13181ae6159SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
13281ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
13381ae6159SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
13481ae6159SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
13581ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
13681ae6159SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
13781ae6159SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
13881ae6159SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
13981ae6159SJeremy L Thompson   }
14081ae6159SJeremy L Thompson 
14181ae6159SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
14281ae6159SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
14381ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_PTS;
14481ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_NODES;
14581ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
14681ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
14781ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_PTS;
14881ae6159SJeremy L Thompson 
14981ae6159SJeremy L Thompson   // Apply basis element by element
1501c21e869SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
1511c21e869SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
152db2becc9SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
153db2becc9SJeremy L Thompson       CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
1541c21e869SJeremy L Thompson       CeedInt           pre   = 1;
1551c21e869SJeremy L Thompson       CeedInt           post  = 1;
1561c21e869SJeremy L Thompson 
1571c21e869SJeremy L Thompson       // Clear Chebyshev coeffs
1581c21e869SJeremy L Thompson       for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
1591c21e869SJeremy L Thompson         s_chebyshev_coeffs[k] = 0.0;
1601c21e869SJeremy L Thompson       }
1611c21e869SJeremy L Thompson 
1621c21e869SJeremy L Thompson       // Map from point
1632d10e82cSJeremy L Thompson       __syncthreads();
1642d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
165111870feSJeremy L Thompson         if (p >= points_per_elem[elem]) continue;
1661c21e869SJeremy L Thompson         pre  = 1;
1671c21e869SJeremy L Thompson         post = 1;
1681c21e869SJeremy L Thompson         for (CeedInt d = 0; d < BASIS_DIM; d++) {
1691c21e869SJeremy L Thompson           // Update buffers used
1701c21e869SJeremy L Thompson           pre /= 1;
171db2becc9SJeremy L Thompson           const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
1721c21e869SJeremy L Thompson           CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);
1731c21e869SJeremy L Thompson 
1741c21e869SJeremy L Thompson           // Build Chebyshev polynomial values
1751c21e869SJeremy L Thompson           ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);
1761c21e869SJeremy L Thompson 
1771c21e869SJeremy L Thompson           // Contract along middle index
1781c21e869SJeremy L Thompson           for (CeedInt a = 0; a < pre; a++) {
1791c21e869SJeremy L Thompson             for (CeedInt c = 0; c < post; c++) {
1801c21e869SJeremy L Thompson               if (d == BASIS_DIM - 1) {
181ad8059fcSJeremy L Thompson                 for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
1821c21e869SJeremy L Thompson               } else {
1831c21e869SJeremy L Thompson                 for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
1841c21e869SJeremy L Thompson               }
1851c21e869SJeremy L Thompson             }
1861c21e869SJeremy L Thompson           }
1871c21e869SJeremy L Thompson           post *= Q;
1881c21e869SJeremy L Thompson         }
1891c21e869SJeremy L Thompson       }
1901c21e869SJeremy L Thompson 
1911c21e869SJeremy L Thompson       // Map from coefficients
1921c21e869SJeremy L Thompson       pre  = BASIS_NUM_QPTS;
1931c21e869SJeremy L Thompson       post = 1;
1941c21e869SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
1951c21e869SJeremy L Thompson         __syncthreads();
1961c21e869SJeremy L Thompson         // Update buffers used
1971c21e869SJeremy L Thompson         pre /= Q;
1981c21e869SJeremy L Thompson         const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
1991c21e869SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
2001c21e869SJeremy L Thompson         const CeedInt     writeLen = pre * post * P;
2011c21e869SJeremy L Thompson 
2021c21e869SJeremy L Thompson         // Contract along middle index
2031c21e869SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
2041c21e869SJeremy L Thompson           const CeedInt c   = k % post;
2051c21e869SJeremy L Thompson           const CeedInt j   = (k / post) % P;
2061c21e869SJeremy L Thompson           const CeedInt a   = k / (post * P);
2071c21e869SJeremy L Thompson           CeedScalar    v_k = 0;
2081c21e869SJeremy L Thompson 
2091c21e869SJeremy L Thompson           for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
210db2becc9SJeremy L Thompson           if (d == BASIS_DIM - 1) out[k] += v_k;
211db2becc9SJeremy L Thompson           else out[k] = v_k;
2121c21e869SJeremy L Thompson         }
2131c21e869SJeremy L Thompson         post *= P;
2141c21e869SJeremy L Thompson       }
2151c21e869SJeremy L Thompson     }
2161c21e869SJeremy L Thompson   }
21781ae6159SJeremy L Thompson }
21881ae6159SJeremy L Thompson 
21981ae6159SJeremy L Thompson //------------------------------------------------------------------------------
22081ae6159SJeremy L Thompson // Grad
22181ae6159SJeremy L Thompson //------------------------------------------------------------------------------
GradAtPoints(const CeedInt num_elem,const CeedScalar * __restrict__ chebyshev_interp_1d,const CeedInt * __restrict__ points_per_elem,const CeedScalar * __restrict__ coords,const CeedScalar * __restrict__ u,CeedScalar * __restrict__ v)22281ae6159SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
22381ae6159SJeremy L Thompson                                         const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
22481ae6159SJeremy L Thompson                                         const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
22581ae6159SJeremy L Thompson   const CeedInt i = threadIdx.x;
22681ae6159SJeremy L Thompson 
22781ae6159SJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
22881ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
22981ae6159SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
23081ae6159SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
23181ae6159SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
23281ae6159SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
23381ae6159SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
23481ae6159SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
23581ae6159SJeremy L Thompson   }
23681ae6159SJeremy L Thompson 
23781ae6159SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
23881ae6159SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
23981ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_NODES;
24081ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_PTS;
24181ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
24281ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
24381ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_NODES;
24481ae6159SJeremy L Thompson   const CeedInt u_dim_stride  = 0;
24581ae6159SJeremy L Thompson   const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
24681ae6159SJeremy L Thompson 
24781ae6159SJeremy L Thompson   // Apply basis element by element
2481c21e869SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
2491c21e869SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
250db2becc9SJeremy L Thompson       const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
2511c21e869SJeremy L Thompson       CeedInt           pre   = u_size;
2521c21e869SJeremy L Thompson       CeedInt           post  = 1;
2531c21e869SJeremy L Thompson 
2541c21e869SJeremy L Thompson       // Map to coefficients
2551c21e869SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
2561c21e869SJeremy L Thompson         __syncthreads();
2571c21e869SJeremy L Thompson         // Update buffers used
2581c21e869SJeremy L Thompson         pre /= P;
2591c21e869SJeremy L Thompson         const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
2601c21e869SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
2611c21e869SJeremy L Thompson         const CeedInt     writeLen = pre * post * Q;
2621c21e869SJeremy L Thompson 
2631c21e869SJeremy L Thompson         // Contract along middle index
2641c21e869SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
2651c21e869SJeremy L Thompson           const CeedInt c   = k % post;
2661c21e869SJeremy L Thompson           const CeedInt j   = (k / post) % Q;
2671c21e869SJeremy L Thompson           const CeedInt a   = k / (post * Q);
2681c21e869SJeremy L Thompson           CeedScalar    v_k = 0;
2691c21e869SJeremy L Thompson 
2701c21e869SJeremy L Thompson           for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
2711c21e869SJeremy L Thompson           out[k] = v_k;
2721c21e869SJeremy L Thompson         }
2731c21e869SJeremy L Thompson         post *= Q;
2741c21e869SJeremy L Thompson       }
2751c21e869SJeremy L Thompson 
2761c21e869SJeremy L Thompson       // Map to point
2771c21e869SJeremy L Thompson       __syncthreads();
2782d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
27981ae6159SJeremy L Thompson         for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
28081ae6159SJeremy L Thompson           CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];
28181ae6159SJeremy L Thompson 
2821c21e869SJeremy L Thompson           pre  = BASIS_NUM_QPTS;
2831c21e869SJeremy L Thompson           post = 1;
28481ae6159SJeremy L Thompson           for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
2851c21e869SJeremy L Thompson             // Update buffers used
2861c21e869SJeremy L Thompson             pre /= Q;
28781ae6159SJeremy L Thompson             const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
28881ae6159SJeremy L Thompson             CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);
2891c21e869SJeremy L Thompson 
2901c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
29181ae6159SJeremy L Thompson             if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
29281ae6159SJeremy L Thompson             else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
2931c21e869SJeremy L Thompson 
2941c21e869SJeremy L Thompson             // Contract along middle index
2951c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
2961c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
2971c21e869SJeremy L Thompson                 CeedScalar v_k = 0;
2981c21e869SJeremy L Thompson 
2991c21e869SJeremy L Thompson                 for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
3001c21e869SJeremy L Thompson                 out[a * post + c] = v_k;
3011c21e869SJeremy L Thompson               }
3021c21e869SJeremy L Thompson             }
3031c21e869SJeremy L Thompson             post *= 1;
3041c21e869SJeremy L Thompson           }
3051c21e869SJeremy L Thompson         }
3061c21e869SJeremy L Thompson       }
3071c21e869SJeremy L Thompson     }
3081c21e869SJeremy L Thompson   }
3091c21e869SJeremy L Thompson }
3101c21e869SJeremy L Thompson 
GradTransposeAtPoints(const CeedInt num_elem,const CeedScalar * __restrict__ chebyshev_interp_1d,const CeedInt * __restrict__ points_per_elem,const CeedScalar * __restrict__ coords,const CeedScalar * __restrict__ u,CeedScalar * __restrict__ v)31181ae6159SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
312111870feSJeremy L Thompson                                                  const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
313111870feSJeremy L Thompson                                                  const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
3141c21e869SJeremy L Thompson   const CeedInt i = threadIdx.x;
3151c21e869SJeremy L Thompson 
316f7c9815fSJeremy L Thompson   __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
3171c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_interp_1d = s_mem;
3181c21e869SJeremy L Thompson   CeedScalar           *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
3191c21e869SJeremy L Thompson   CeedScalar           *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
3201c21e869SJeremy L Thompson   CeedScalar           *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
3211c21e869SJeremy L Thompson   CeedScalar            chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
3221c21e869SJeremy L Thompson   for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
3231c21e869SJeremy L Thompson     s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
3241c21e869SJeremy L Thompson   }
3251c21e869SJeremy L Thompson 
3261c21e869SJeremy L Thompson   const CeedInt P             = BASIS_P_1D;
3271c21e869SJeremy L Thompson   const CeedInt Q             = BASIS_Q_1D;
32881ae6159SJeremy L Thompson   const CeedInt u_stride      = BASIS_NUM_PTS;
32981ae6159SJeremy L Thompson   const CeedInt v_stride      = BASIS_NUM_NODES;
33081ae6159SJeremy L Thompson   const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
33181ae6159SJeremy L Thompson   const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
33281ae6159SJeremy L Thompson   const CeedInt u_size        = BASIS_NUM_PTS;
33381ae6159SJeremy L Thompson   const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
33481ae6159SJeremy L Thompson   const CeedInt v_dim_stride  = 0;
3351c21e869SJeremy L Thompson 
3361c21e869SJeremy L Thompson   // Apply basis element by element
3371c21e869SJeremy L Thompson   for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
3381c21e869SJeremy L Thompson     for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
339db2becc9SJeremy L Thompson       CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
3401c21e869SJeremy L Thompson       CeedInt     pre   = 1;
3411c21e869SJeremy L Thompson       CeedInt     post  = 1;
3421c21e869SJeremy L Thompson 
3431c21e869SJeremy L Thompson       // Clear Chebyshev coeffs
3441c21e869SJeremy L Thompson       for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
3451c21e869SJeremy L Thompson         s_chebyshev_coeffs[k] = 0.0;
3461c21e869SJeremy L Thompson       }
3471c21e869SJeremy L Thompson 
3481c21e869SJeremy L Thompson       // Map from point
3492d10e82cSJeremy L Thompson       __syncthreads();
3502d10e82cSJeremy L Thompson       for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
351111870feSJeremy L Thompson         if (p >= points_per_elem[elem]) continue;
3521c21e869SJeremy L Thompson         for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
353db2becc9SJeremy L Thompson           const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];
3541c21e869SJeremy L Thompson 
3551c21e869SJeremy L Thompson           pre  = 1;
3561c21e869SJeremy L Thompson           post = 1;
3571c21e869SJeremy L Thompson           for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
3581c21e869SJeremy L Thompson             // Update buffers used
3591c21e869SJeremy L Thompson             pre /= 1;
360db2becc9SJeremy L Thompson             const CeedScalar *in  = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
3611c21e869SJeremy L Thompson             CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);
3621c21e869SJeremy L Thompson 
3631c21e869SJeremy L Thompson             // Build Chebyshev polynomial values
3641c21e869SJeremy L Thompson             if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
3651c21e869SJeremy L Thompson             else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
3661c21e869SJeremy L Thompson 
3671c21e869SJeremy L Thompson             // Contract along middle index
3681c21e869SJeremy L Thompson             for (CeedInt a = 0; a < pre; a++) {
3691c21e869SJeremy L Thompson               for (CeedInt c = 0; c < post; c++) {
3701c21e869SJeremy L Thompson                 if (dim_2 == BASIS_DIM - 1) {
371ad8059fcSJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
3721c21e869SJeremy L Thompson                 } else {
3731c21e869SJeremy L Thompson                   for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
3741c21e869SJeremy L Thompson                 }
3751c21e869SJeremy L Thompson               }
3761c21e869SJeremy L Thompson             }
3771c21e869SJeremy L Thompson             post *= Q;
3781c21e869SJeremy L Thompson           }
3791c21e869SJeremy L Thompson         }
3801c21e869SJeremy L Thompson       }
3811c21e869SJeremy L Thompson 
3821c21e869SJeremy L Thompson       // Map from coefficients
3831c21e869SJeremy L Thompson       pre  = BASIS_NUM_QPTS;
3841c21e869SJeremy L Thompson       post = 1;
3851c21e869SJeremy L Thompson       for (CeedInt d = 0; d < BASIS_DIM; d++) {
3861c21e869SJeremy L Thompson         __syncthreads();
3871c21e869SJeremy L Thompson         // Update buffers used
3881c21e869SJeremy L Thompson         pre /= Q;
3891c21e869SJeremy L Thompson         const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
3901c21e869SJeremy L Thompson         CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
3911c21e869SJeremy L Thompson         const CeedInt     writeLen = pre * post * P;
3921c21e869SJeremy L Thompson 
3931c21e869SJeremy L Thompson         // Contract along middle index
3941c21e869SJeremy L Thompson         for (CeedInt k = i; k < writeLen; k += blockDim.x) {
3951c21e869SJeremy L Thompson           const CeedInt c   = k % post;
3961c21e869SJeremy L Thompson           const CeedInt j   = (k / post) % P;
3971c21e869SJeremy L Thompson           const CeedInt a   = k / (post * P);
3981c21e869SJeremy L Thompson           CeedScalar    v_k = 0;
3991c21e869SJeremy L Thompson 
4001c21e869SJeremy L Thompson           for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
401db2becc9SJeremy L Thompson           if (d == BASIS_DIM - 1) out[k] += v_k;
402db2becc9SJeremy L Thompson           else out[k] = v_k;
4031c21e869SJeremy L Thompson         }
4041c21e869SJeremy L Thompson         post *= P;
4051c21e869SJeremy L Thompson       }
4061c21e869SJeremy L Thompson     }
4071c21e869SJeremy L Thompson   }
4081c21e869SJeremy L Thompson }
409