134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 334d14614SJeremy L Thompson // 434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 534d14614SJeremy L Thompson // 634d14614SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 734d14614SJeremy L Thompson 834d14614SJeremy L Thompson /// @file 934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 1034d14614SJeremy L Thompson 1134d14614SJeremy L Thompson #include <ceed.h> 1234d14614SJeremy L Thompson 1334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1434d14614SJeremy L Thompson // Chebyshev values 1534d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1634d14614SJeremy L Thompson template <int Q_1D> 1734d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 1834d14614SJeremy L Thompson chebyshev_x[0] = 1.0; 1934d14614SJeremy L Thompson chebyshev_x[1] = 2 * x; 2034d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 2134d14614SJeremy L Thompson } 2234d14614SJeremy L Thompson 2334d14614SJeremy L Thompson template <int Q_1D> 2434d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 2534d14614SJeremy L Thompson CeedScalar chebyshev_x[3]; 2634d14614SJeremy L Thompson 2734d14614SJeremy L Thompson chebyshev_x[1] = 1.0; 2834d14614SJeremy L Thompson chebyshev_x[2] = 2 * x; 2934d14614SJeremy L Thompson chebyshev_dx[0] = 0.0; 3034d14614SJeremy L Thompson chebyshev_dx[1] = 2.0; 3134d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 32*80c135a8SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 33*80c135a8SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 3434d14614SJeremy L Thompson } 3534d14614SJeremy L Thompson } 3634d14614SJeremy L Thompson 3734d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3834d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints 3934d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4034d14614SJeremy L Thompson 4134d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4234d14614SJeremy L Thompson // Interp 4334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4434d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 4534d14614SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 4634d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 4734d14614SJeremy L Thompson 48f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 4934d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 5034d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 5134d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 5234d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 5334d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 5434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 5534d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 5634d14614SJeremy L Thompson } 5734d14614SJeremy L Thompson 5834d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 5934d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 6034d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6134d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 6234d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 6334d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 6434d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6534d14614SJeremy L Thompson 6634d14614SJeremy L Thompson // Apply basis element by element 6734d14614SJeremy L Thompson if (is_transpose) { 6834d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 6934d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 7034d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 7134d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 7234d14614SJeremy L Thompson CeedInt pre = 1; 7334d14614SJeremy L Thompson CeedInt post = 1; 7434d14614SJeremy L Thompson 7534d14614SJeremy L Thompson // Clear Chebyshev coeffs 7634d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 7734d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 7834d14614SJeremy L Thompson } 7934d14614SJeremy L Thompson 8034d14614SJeremy L Thompson // Map from point 812d10e82cSJeremy L Thompson __syncthreads(); 822d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 8334d14614SJeremy L Thompson pre = 1; 8434d14614SJeremy L Thompson post = 1; 8534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 8634d14614SJeremy L Thompson // Update buffers used 8734d14614SJeremy L Thompson pre /= 1; 8834d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); 8934d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 9034d14614SJeremy L Thompson 9134d14614SJeremy L Thompson // Build Chebyshev polynomial values 9234d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 9334d14614SJeremy L Thompson 9434d14614SJeremy L Thompson // Contract along middle index 9534d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 9634d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 9734d14614SJeremy L Thompson if (d == BASIS_DIM - 1) { 98ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 9934d14614SJeremy L Thompson } else { 10034d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 10134d14614SJeremy L Thompson } 10234d14614SJeremy L Thompson } 10334d14614SJeremy L Thompson } 10434d14614SJeremy L Thompson post *= Q; 10534d14614SJeremy L Thompson } 10634d14614SJeremy L Thompson } 10734d14614SJeremy L Thompson 10834d14614SJeremy L Thompson // Map from coefficients 10934d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 11034d14614SJeremy L Thompson post = 1; 11134d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 11234d14614SJeremy L Thompson __syncthreads(); 11334d14614SJeremy L Thompson // Update buffers used 11434d14614SJeremy L Thompson pre /= Q; 11534d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 11634d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 11734d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 11834d14614SJeremy L Thompson 11934d14614SJeremy L Thompson // Contract along middle index 12034d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 12134d14614SJeremy L Thompson const CeedInt c = k % post; 12234d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 12334d14614SJeremy L Thompson const CeedInt a = k / (post * P); 12434d14614SJeremy L Thompson CeedScalar v_k = 0; 12534d14614SJeremy L Thompson 12634d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 12734d14614SJeremy L Thompson out[k] = v_k; 12834d14614SJeremy L Thompson } 12934d14614SJeremy L Thompson post *= P; 13034d14614SJeremy L Thompson } 13134d14614SJeremy L Thompson } 13234d14614SJeremy L Thompson } 13334d14614SJeremy L Thompson } else { 13434d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 13534d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 13634d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 13734d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 13834d14614SJeremy L Thompson CeedInt pre = u_size; 13934d14614SJeremy L Thompson CeedInt post = 1; 14034d14614SJeremy L Thompson 14134d14614SJeremy L Thompson // Map to coefficients 14234d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 14334d14614SJeremy L Thompson __syncthreads(); 14434d14614SJeremy L Thompson // Update buffers used 14534d14614SJeremy L Thompson pre /= P; 14634d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 14734d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 14834d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 14934d14614SJeremy L Thompson 15034d14614SJeremy L Thompson // Contract along middle index 15134d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 15234d14614SJeremy L Thompson const CeedInt c = k % post; 15334d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 15434d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 15534d14614SJeremy L Thompson CeedScalar v_k = 0; 15634d14614SJeremy L Thompson 15734d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 15834d14614SJeremy L Thompson out[k] = v_k; 15934d14614SJeremy L Thompson } 16034d14614SJeremy L Thompson post *= Q; 16134d14614SJeremy L Thompson } 16234d14614SJeremy L Thompson 16334d14614SJeremy L Thompson // Map to point 16434d14614SJeremy L Thompson __syncthreads(); 1652d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 16634d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 16734d14614SJeremy L Thompson post = 1; 16834d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 16934d14614SJeremy L Thompson // Update buffers used 17034d14614SJeremy L Thompson pre /= Q; 17134d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 17234d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); 17334d14614SJeremy L Thompson 17434d14614SJeremy L Thompson // Build Chebyshev polynomial values 17534d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 17634d14614SJeremy L Thompson 17734d14614SJeremy L Thompson // Contract along middle index 17834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 17934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 18034d14614SJeremy L Thompson CeedScalar v_k = 0; 18134d14614SJeremy L Thompson 18234d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 18334d14614SJeremy L Thompson out[a * post + c] = v_k; 18434d14614SJeremy L Thompson } 18534d14614SJeremy L Thompson } 18634d14614SJeremy L Thompson post *= 1; 18734d14614SJeremy L Thompson } 18834d14614SJeremy L Thompson } 18934d14614SJeremy L Thompson } 19034d14614SJeremy L Thompson } 19134d14614SJeremy L Thompson } 19234d14614SJeremy L Thompson } 19334d14614SJeremy L Thompson 19434d14614SJeremy L Thompson //------------------------------------------------------------------------------ 19534d14614SJeremy L Thompson // Grad 19634d14614SJeremy L Thompson //------------------------------------------------------------------------------ 19734d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 19834d14614SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 19934d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 20034d14614SJeremy L Thompson 201f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 20234d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 20334d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 20434d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 20534d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 20634d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 20734d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 20834d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 20934d14614SJeremy L Thompson } 21034d14614SJeremy L Thompson 21134d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 21234d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 21334d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 21434d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 21534d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 21634d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 21734d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 21834d14614SJeremy L Thompson const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; 21934d14614SJeremy L Thompson const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 22034d14614SJeremy L Thompson 22134d14614SJeremy L Thompson // Apply basis element by element 22234d14614SJeremy L Thompson if (is_transpose) { 22334d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 22434d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 22534d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 22634d14614SJeremy L Thompson CeedInt pre = 1; 22734d14614SJeremy L Thompson CeedInt post = 1; 22834d14614SJeremy L Thompson 22934d14614SJeremy L Thompson // Clear Chebyshev coeffs 23034d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 23134d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 23234d14614SJeremy L Thompson } 23334d14614SJeremy L Thompson 23434d14614SJeremy L Thompson // Map from point 2352d10e82cSJeremy L Thompson __syncthreads(); 2362d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 23734d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 23834d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 23934d14614SJeremy L Thompson 24034d14614SJeremy L Thompson pre = 1; 24134d14614SJeremy L Thompson post = 1; 24234d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 24334d14614SJeremy L Thompson // Update buffers used 24434d14614SJeremy L Thompson pre /= 1; 24534d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 24634d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 24734d14614SJeremy L Thompson 24834d14614SJeremy L Thompson // Build Chebyshev polynomial values 24934d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25034d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25134d14614SJeremy L Thompson 25234d14614SJeremy L Thompson // Contract along middle index 25334d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 25434d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 25534d14614SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 256ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 25734d14614SJeremy L Thompson } else { 25834d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 25934d14614SJeremy L Thompson } 26034d14614SJeremy L Thompson } 26134d14614SJeremy L Thompson } 26234d14614SJeremy L Thompson post *= Q; 26334d14614SJeremy L Thompson } 26434d14614SJeremy L Thompson } 26534d14614SJeremy L Thompson } 26634d14614SJeremy L Thompson 26734d14614SJeremy L Thompson // Map from coefficients 26834d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 26934d14614SJeremy L Thompson post = 1; 27034d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 27134d14614SJeremy L Thompson __syncthreads(); 27234d14614SJeremy L Thompson // Update buffers used 27334d14614SJeremy L Thompson pre /= Q; 27434d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 27534d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 27634d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 27734d14614SJeremy L Thompson 27834d14614SJeremy L Thompson // Contract along middle index 27934d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 28034d14614SJeremy L Thompson const CeedInt c = k % post; 28134d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 28234d14614SJeremy L Thompson const CeedInt a = k / (post * P); 28334d14614SJeremy L Thompson CeedScalar v_k = 0; 28434d14614SJeremy L Thompson 28534d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 28634d14614SJeremy L Thompson out[k] = v_k; 28734d14614SJeremy L Thompson } 28834d14614SJeremy L Thompson post *= P; 28934d14614SJeremy L Thompson } 29034d14614SJeremy L Thompson } 29134d14614SJeremy L Thompson } 29234d14614SJeremy L Thompson } else { 29334d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 29434d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 29534d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 29634d14614SJeremy L Thompson CeedInt pre = u_size; 29734d14614SJeremy L Thompson CeedInt post = 1; 29834d14614SJeremy L Thompson 29934d14614SJeremy L Thompson // Map to coefficients 30034d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 30134d14614SJeremy L Thompson __syncthreads(); 30234d14614SJeremy L Thompson // Update buffers used 30334d14614SJeremy L Thompson pre /= P; 30434d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 30534d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 30634d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 30734d14614SJeremy L Thompson 30834d14614SJeremy L Thompson // Contract along middle index 30934d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 31034d14614SJeremy L Thompson const CeedInt c = k % post; 31134d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 31234d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 31334d14614SJeremy L Thompson CeedScalar v_k = 0; 31434d14614SJeremy L Thompson 31534d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 31634d14614SJeremy L Thompson out[k] = v_k; 31734d14614SJeremy L Thompson } 31834d14614SJeremy L Thompson post *= Q; 31934d14614SJeremy L Thompson } 32034d14614SJeremy L Thompson 32134d14614SJeremy L Thompson // Map to point 32234d14614SJeremy L Thompson __syncthreads(); 3232d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 32434d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 32534d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 32634d14614SJeremy L Thompson 32734d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 32834d14614SJeremy L Thompson post = 1; 32934d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 33034d14614SJeremy L Thompson // Update buffers used 33134d14614SJeremy L Thompson pre /= Q; 33234d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 33334d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 33434d14614SJeremy L Thompson 33534d14614SJeremy L Thompson // Build Chebyshev polynomial values 33634d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 33734d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 33834d14614SJeremy L Thompson 33934d14614SJeremy L Thompson // Contract along middle index 34034d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 34134d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 34234d14614SJeremy L Thompson CeedScalar v_k = 0; 34334d14614SJeremy L Thompson 34434d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 34534d14614SJeremy L Thompson out[a * post + c] = v_k; 34634d14614SJeremy L Thompson } 34734d14614SJeremy L Thompson } 34834d14614SJeremy L Thompson post *= 1; 34934d14614SJeremy L Thompson } 35034d14614SJeremy L Thompson } 35134d14614SJeremy L Thompson } 35234d14614SJeremy L Thompson } 35334d14614SJeremy L Thompson } 35434d14614SJeremy L Thompson } 35534d14614SJeremy L Thompson } 356