134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 334d14614SJeremy L Thompson // 434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 534d14614SJeremy L Thompson // 634d14614SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 734d14614SJeremy L Thompson 834d14614SJeremy L Thompson /// @file 934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 1034d14614SJeremy L Thompson 1134d14614SJeremy L Thompson #include <ceed.h> 1234d14614SJeremy L Thompson 1334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1434d14614SJeremy L Thompson // Chebyshev values 1534d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1634d14614SJeremy L Thompson template <int Q_1D> 1734d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 1834d14614SJeremy L Thompson chebyshev_x[0] = 1.0; 1934d14614SJeremy L Thompson chebyshev_x[1] = 2 * x; 2034d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 2134d14614SJeremy L Thompson } 2234d14614SJeremy L Thompson 2334d14614SJeremy L Thompson template <int Q_1D> 2434d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 2534d14614SJeremy L Thompson CeedScalar chebyshev_x[3]; 2634d14614SJeremy L Thompson 2734d14614SJeremy L Thompson chebyshev_x[1] = 1.0; 2834d14614SJeremy L Thompson chebyshev_x[2] = 2 * x; 2934d14614SJeremy L Thompson chebyshev_dx[0] = 0.0; 3034d14614SJeremy L Thompson chebyshev_dx[1] = 2.0; 3134d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 3234d14614SJeremy L Thompson chebyshev_x[0] = chebyshev_x[1]; 3334d14614SJeremy L Thompson chebyshev_x[1] = chebyshev_x[2]; 3434d14614SJeremy L Thompson chebyshev_x[2] = 2 * x * chebyshev_x[1] - chebyshev_x[0]; 3534d14614SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2]; 3634d14614SJeremy L Thompson } 3734d14614SJeremy L Thompson } 3834d14614SJeremy L Thompson 3934d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4034d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints 4134d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4234d14614SJeremy L Thompson 4334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4434d14614SJeremy L Thompson // Interp 4534d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4634d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 4734d14614SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 4834d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 4934d14614SJeremy L Thompson 5034d14614SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 5134d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 5234d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 5334d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 5434d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 5534d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 5634d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 5734d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 5834d14614SJeremy L Thompson } 5934d14614SJeremy L Thompson 6034d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 6134d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 6234d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6334d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 6434d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 6534d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 6634d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6734d14614SJeremy L Thompson 6834d14614SJeremy L Thompson // Apply basis element by element 6934d14614SJeremy L Thompson if (is_transpose) { 7034d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 7134d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 7234d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 7334d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 7434d14614SJeremy L Thompson CeedInt pre = 1; 7534d14614SJeremy L Thompson CeedInt post = 1; 7634d14614SJeremy L Thompson 7734d14614SJeremy L Thompson // Clear Chebyshev coeffs 7834d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 7934d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 8034d14614SJeremy L Thompson } 8134d14614SJeremy L Thompson 8234d14614SJeremy L Thompson // Map from point 83*2d10e82cSJeremy L Thompson __syncthreads(); 84*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 8534d14614SJeremy L Thompson pre = 1; 8634d14614SJeremy L Thompson post = 1; 8734d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 8834d14614SJeremy L Thompson // Update buffers used 8934d14614SJeremy L Thompson pre /= 1; 9034d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); 9134d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 9234d14614SJeremy L Thompson 9334d14614SJeremy L Thompson // Build Chebyshev polynomial values 9434d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 9534d14614SJeremy L Thompson 9634d14614SJeremy L Thompson // Contract along middle index 9734d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 9834d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 9934d14614SJeremy L Thompson if (d == BASIS_DIM - 1) { 100*2d10e82cSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]); 10134d14614SJeremy L Thompson } else { 10234d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 10334d14614SJeremy L Thompson } 10434d14614SJeremy L Thompson } 10534d14614SJeremy L Thompson } 10634d14614SJeremy L Thompson post *= Q; 10734d14614SJeremy L Thompson } 10834d14614SJeremy L Thompson } 10934d14614SJeremy L Thompson 11034d14614SJeremy L Thompson // Map from coefficients 11134d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 11234d14614SJeremy L Thompson post = 1; 11334d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 11434d14614SJeremy L Thompson __syncthreads(); 11534d14614SJeremy L Thompson // Update buffers used 11634d14614SJeremy L Thompson pre /= Q; 11734d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 11834d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 11934d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 12034d14614SJeremy L Thompson 12134d14614SJeremy L Thompson // Contract along middle index 12234d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 12334d14614SJeremy L Thompson const CeedInt c = k % post; 12434d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 12534d14614SJeremy L Thompson const CeedInt a = k / (post * P); 12634d14614SJeremy L Thompson CeedScalar v_k = 0; 12734d14614SJeremy L Thompson 12834d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 12934d14614SJeremy L Thompson out[k] = v_k; 13034d14614SJeremy L Thompson } 13134d14614SJeremy L Thompson post *= P; 13234d14614SJeremy L Thompson } 13334d14614SJeremy L Thompson } 13434d14614SJeremy L Thompson } 13534d14614SJeremy L Thompson } else { 13634d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 13734d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 13834d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 13934d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 14034d14614SJeremy L Thompson CeedInt pre = u_size; 14134d14614SJeremy L Thompson CeedInt post = 1; 14234d14614SJeremy L Thompson 14334d14614SJeremy L Thompson // Map to coefficients 14434d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 14534d14614SJeremy L Thompson __syncthreads(); 14634d14614SJeremy L Thompson // Update buffers used 14734d14614SJeremy L Thompson pre /= P; 14834d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 14934d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 15034d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 15134d14614SJeremy L Thompson 15234d14614SJeremy L Thompson // Contract along middle index 15334d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 15434d14614SJeremy L Thompson const CeedInt c = k % post; 15534d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 15634d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 15734d14614SJeremy L Thompson CeedScalar v_k = 0; 15834d14614SJeremy L Thompson 15934d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 16034d14614SJeremy L Thompson out[k] = v_k; 16134d14614SJeremy L Thompson } 16234d14614SJeremy L Thompson post *= Q; 16334d14614SJeremy L Thompson } 16434d14614SJeremy L Thompson 16534d14614SJeremy L Thompson // Map to point 16634d14614SJeremy L Thompson __syncthreads(); 167*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 16834d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 16934d14614SJeremy L Thompson post = 1; 17034d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 17134d14614SJeremy L Thompson // Update buffers used 17234d14614SJeremy L Thompson pre /= Q; 17334d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 17434d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); 17534d14614SJeremy L Thompson 17634d14614SJeremy L Thompson // Build Chebyshev polynomial values 17734d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 17834d14614SJeremy L Thompson 17934d14614SJeremy L Thompson // Contract along middle index 18034d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 18134d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 18234d14614SJeremy L Thompson CeedScalar v_k = 0; 18334d14614SJeremy L Thompson 18434d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 18534d14614SJeremy L Thompson out[a * post + c] = v_k; 18634d14614SJeremy L Thompson } 18734d14614SJeremy L Thompson } 18834d14614SJeremy L Thompson post *= 1; 18934d14614SJeremy L Thompson } 19034d14614SJeremy L Thompson } 19134d14614SJeremy L Thompson } 19234d14614SJeremy L Thompson } 19334d14614SJeremy L Thompson } 19434d14614SJeremy L Thompson } 19534d14614SJeremy L Thompson 19634d14614SJeremy L Thompson //------------------------------------------------------------------------------ 19734d14614SJeremy L Thompson // Grad 19834d14614SJeremy L Thompson //------------------------------------------------------------------------------ 19934d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 20034d14614SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 20134d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 20234d14614SJeremy L Thompson 20334d14614SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 20434d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 20534d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 20634d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 20734d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 20834d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 20934d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 21034d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 21134d14614SJeremy L Thompson } 21234d14614SJeremy L Thompson 21334d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 21434d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 21534d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 21634d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 21734d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 21834d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 21934d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 22034d14614SJeremy L Thompson const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; 22134d14614SJeremy L Thompson const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 22234d14614SJeremy L Thompson 22334d14614SJeremy L Thompson // Apply basis element by element 22434d14614SJeremy L Thompson if (is_transpose) { 22534d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 22634d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 22734d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 22834d14614SJeremy L Thompson CeedInt pre = 1; 22934d14614SJeremy L Thompson CeedInt post = 1; 23034d14614SJeremy L Thompson 23134d14614SJeremy L Thompson // Clear Chebyshev coeffs 23234d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 23334d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 23434d14614SJeremy L Thompson } 23534d14614SJeremy L Thompson 23634d14614SJeremy L Thompson // Map from point 237*2d10e82cSJeremy L Thompson __syncthreads(); 238*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 23934d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 24034d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 24134d14614SJeremy L Thompson 24234d14614SJeremy L Thompson pre = 1; 24334d14614SJeremy L Thompson post = 1; 24434d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 24534d14614SJeremy L Thompson // Update buffers used 24634d14614SJeremy L Thompson pre /= 1; 24734d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 24834d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 24934d14614SJeremy L Thompson 25034d14614SJeremy L Thompson // Build Chebyshev polynomial values 25134d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25234d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25334d14614SJeremy L Thompson 25434d14614SJeremy L Thompson // Contract along middle index 25534d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 25634d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 25734d14614SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 258*2d10e82cSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]); 25934d14614SJeremy L Thompson } else { 26034d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 26134d14614SJeremy L Thompson } 26234d14614SJeremy L Thompson } 26334d14614SJeremy L Thompson } 26434d14614SJeremy L Thompson post *= Q; 26534d14614SJeremy L Thompson } 26634d14614SJeremy L Thompson } 26734d14614SJeremy L Thompson } 26834d14614SJeremy L Thompson 26934d14614SJeremy L Thompson // Map from coefficients 27034d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 27134d14614SJeremy L Thompson post = 1; 27234d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 27334d14614SJeremy L Thompson __syncthreads(); 27434d14614SJeremy L Thompson // Update buffers used 27534d14614SJeremy L Thompson pre /= Q; 27634d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 27734d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 27834d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 27934d14614SJeremy L Thompson 28034d14614SJeremy L Thompson // Contract along middle index 28134d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 28234d14614SJeremy L Thompson const CeedInt c = k % post; 28334d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 28434d14614SJeremy L Thompson const CeedInt a = k / (post * P); 28534d14614SJeremy L Thompson CeedScalar v_k = 0; 28634d14614SJeremy L Thompson 28734d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 28834d14614SJeremy L Thompson out[k] = v_k; 28934d14614SJeremy L Thompson } 29034d14614SJeremy L Thompson post *= P; 29134d14614SJeremy L Thompson } 29234d14614SJeremy L Thompson } 29334d14614SJeremy L Thompson } 29434d14614SJeremy L Thompson } else { 29534d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 29634d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 29734d14614SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 29834d14614SJeremy L Thompson CeedInt pre = u_size; 29934d14614SJeremy L Thompson CeedInt post = 1; 30034d14614SJeremy L Thompson 30134d14614SJeremy L Thompson // Map to coefficients 30234d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 30334d14614SJeremy L Thompson __syncthreads(); 30434d14614SJeremy L Thompson // Update buffers used 30534d14614SJeremy L Thompson pre /= P; 30634d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 30734d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 30834d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 30934d14614SJeremy L Thompson 31034d14614SJeremy L Thompson // Contract along middle index 31134d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 31234d14614SJeremy L Thompson const CeedInt c = k % post; 31334d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 31434d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 31534d14614SJeremy L Thompson CeedScalar v_k = 0; 31634d14614SJeremy L Thompson 31734d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 31834d14614SJeremy L Thompson out[k] = v_k; 31934d14614SJeremy L Thompson } 32034d14614SJeremy L Thompson post *= Q; 32134d14614SJeremy L Thompson } 32234d14614SJeremy L Thompson 32334d14614SJeremy L Thompson // Map to point 32434d14614SJeremy L Thompson __syncthreads(); 325*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 32634d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 32734d14614SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 32834d14614SJeremy L Thompson 32934d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 33034d14614SJeremy L Thompson post = 1; 33134d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 33234d14614SJeremy L Thompson // Update buffers used 33334d14614SJeremy L Thompson pre /= Q; 33434d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 33534d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 33634d14614SJeremy L Thompson 33734d14614SJeremy L Thompson // Build Chebyshev polynomial values 33834d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 33934d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 34034d14614SJeremy L Thompson 34134d14614SJeremy L Thompson // Contract along middle index 34234d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 34334d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 34434d14614SJeremy L Thompson CeedScalar v_k = 0; 34534d14614SJeremy L Thompson 34634d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 34734d14614SJeremy L Thompson out[a * post + c] = v_k; 34834d14614SJeremy L Thompson } 34934d14614SJeremy L Thompson } 35034d14614SJeremy L Thompson post *= 1; 35134d14614SJeremy L Thompson } 35234d14614SJeremy L Thompson } 35334d14614SJeremy L Thompson } 35434d14614SJeremy L Thompson } 35534d14614SJeremy L Thompson } 35634d14614SJeremy L Thompson } 35734d14614SJeremy L Thompson } 358