11c21e869SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 21c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 31c21e869SJeremy L Thompson // 41c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 51c21e869SJeremy L Thompson // 61c21e869SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 71c21e869SJeremy L Thompson 81c21e869SJeremy L Thompson /// @file 91c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 101c21e869SJeremy L Thompson 111c21e869SJeremy L Thompson #include <ceed.h> 121c21e869SJeremy L Thompson 131c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 141c21e869SJeremy L Thompson // Chebyshev values 151c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 161c21e869SJeremy L Thompson template <int Q_1D> 171c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 181c21e869SJeremy L Thompson chebyshev_x[0] = 1.0; 191c21e869SJeremy L Thompson chebyshev_x[1] = 2 * x; 201c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 211c21e869SJeremy L Thompson } 221c21e869SJeremy L Thompson 231c21e869SJeremy L Thompson template <int Q_1D> 241c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 251c21e869SJeremy L Thompson CeedScalar chebyshev_x[3]; 261c21e869SJeremy L Thompson 271c21e869SJeremy L Thompson chebyshev_x[1] = 1.0; 281c21e869SJeremy L Thompson chebyshev_x[2] = 2 * x; 291c21e869SJeremy L Thompson chebyshev_dx[0] = 0.0; 301c21e869SJeremy L Thompson chebyshev_dx[1] = 2.0; 311c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 321c21e869SJeremy L Thompson chebyshev_x[0] = chebyshev_x[1]; 331c21e869SJeremy L Thompson chebyshev_x[1] = chebyshev_x[2]; 341c21e869SJeremy L Thompson chebyshev_x[2] = 2 * x * chebyshev_x[1] - chebyshev_x[0]; 351c21e869SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2]; 361c21e869SJeremy L Thompson } 371c21e869SJeremy L Thompson } 381c21e869SJeremy L Thompson 391c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 401c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints 411c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 421c21e869SJeremy L Thompson 431c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 441c21e869SJeremy L Thompson // Interp 451c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 461c21e869SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 471c21e869SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 481c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 491c21e869SJeremy L Thompson 501c21e869SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 511c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 521c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 531c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 541c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 551c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 561c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 571c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 581c21e869SJeremy L Thompson } 591c21e869SJeremy L Thompson 601c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 611c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 621c21e869SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 631c21e869SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 641c21e869SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 651c21e869SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 661c21e869SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 671c21e869SJeremy L Thompson 681c21e869SJeremy L Thompson // Apply basis element by element 691c21e869SJeremy L Thompson if (is_transpose) { 701c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 711c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 721c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 731c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 741c21e869SJeremy L Thompson CeedInt pre = 1; 751c21e869SJeremy L Thompson CeedInt post = 1; 761c21e869SJeremy L Thompson 771c21e869SJeremy L Thompson // Clear Chebyshev coeffs 781c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 791c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 801c21e869SJeremy L Thompson } 811c21e869SJeremy L Thompson 821c21e869SJeremy L Thompson // Map from point 83*2d10e82cSJeremy L Thompson __syncthreads(); 84*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 851c21e869SJeremy L Thompson pre = 1; 861c21e869SJeremy L Thompson post = 1; 871c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 881c21e869SJeremy L Thompson // Update buffers used 891c21e869SJeremy L Thompson pre /= 1; 901c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); 911c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 921c21e869SJeremy L Thompson 931c21e869SJeremy L Thompson // Build Chebyshev polynomial values 941c21e869SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 951c21e869SJeremy L Thompson 961c21e869SJeremy L Thompson // Contract along middle index 971c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 981c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 991c21e869SJeremy L Thompson if (d == BASIS_DIM - 1) { 100*2d10e82cSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]); 1011c21e869SJeremy L Thompson } else { 1021c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 1031c21e869SJeremy L Thompson } 1041c21e869SJeremy L Thompson } 1051c21e869SJeremy L Thompson } 1061c21e869SJeremy L Thompson post *= Q; 1071c21e869SJeremy L Thompson } 1081c21e869SJeremy L Thompson } 1091c21e869SJeremy L Thompson 1101c21e869SJeremy L Thompson // Map from coefficients 1111c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 1121c21e869SJeremy L Thompson post = 1; 1131c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 1141c21e869SJeremy L Thompson __syncthreads(); 1151c21e869SJeremy L Thompson // Update buffers used 1161c21e869SJeremy L Thompson pre /= Q; 1171c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 1181c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 1191c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 1201c21e869SJeremy L Thompson 1211c21e869SJeremy L Thompson // Contract along middle index 1221c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 1231c21e869SJeremy L Thompson const CeedInt c = k % post; 1241c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 1251c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 1261c21e869SJeremy L Thompson CeedScalar v_k = 0; 1271c21e869SJeremy L Thompson 1281c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 1291c21e869SJeremy L Thompson out[k] = v_k; 1301c21e869SJeremy L Thompson } 1311c21e869SJeremy L Thompson post *= P; 1321c21e869SJeremy L Thompson } 1331c21e869SJeremy L Thompson } 1341c21e869SJeremy L Thompson } 1351c21e869SJeremy L Thompson } else { 1361c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 1371c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 1381c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 1391c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 1401c21e869SJeremy L Thompson CeedInt pre = u_size; 1411c21e869SJeremy L Thompson CeedInt post = 1; 1421c21e869SJeremy L Thompson 1431c21e869SJeremy L Thompson // Map to coefficients 1441c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 1451c21e869SJeremy L Thompson __syncthreads(); 1461c21e869SJeremy L Thompson // Update buffers used 1471c21e869SJeremy L Thompson pre /= P; 1481c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 1491c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 1501c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 1511c21e869SJeremy L Thompson 1521c21e869SJeremy L Thompson // Contract along middle index 1531c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 1541c21e869SJeremy L Thompson const CeedInt c = k % post; 1551c21e869SJeremy L Thompson const CeedInt j = (k / post) % Q; 1561c21e869SJeremy L Thompson const CeedInt a = k / (post * Q); 1571c21e869SJeremy L Thompson CeedScalar v_k = 0; 1581c21e869SJeremy L Thompson 1591c21e869SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 1601c21e869SJeremy L Thompson out[k] = v_k; 1611c21e869SJeremy L Thompson } 1621c21e869SJeremy L Thompson post *= Q; 1631c21e869SJeremy L Thompson } 1641c21e869SJeremy L Thompson 1651c21e869SJeremy L Thompson // Map to point 1661c21e869SJeremy L Thompson __syncthreads(); 167*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 1681c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 1691c21e869SJeremy L Thompson post = 1; 1701c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 1711c21e869SJeremy L Thompson // Update buffers used 1721c21e869SJeremy L Thompson pre /= Q; 1731c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 1741c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); 1751c21e869SJeremy L Thompson 1761c21e869SJeremy L Thompson // Build Chebyshev polynomial values 1771c21e869SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 1781c21e869SJeremy L Thompson 1791c21e869SJeremy L Thompson // Contract along middle index 1801c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 1811c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 1821c21e869SJeremy L Thompson CeedScalar v_k = 0; 1831c21e869SJeremy L Thompson 1841c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 1851c21e869SJeremy L Thompson out[a * post + c] = v_k; 1861c21e869SJeremy L Thompson } 1871c21e869SJeremy L Thompson } 1881c21e869SJeremy L Thompson post *= 1; 1891c21e869SJeremy L Thompson } 1901c21e869SJeremy L Thompson } 1911c21e869SJeremy L Thompson } 1921c21e869SJeremy L Thompson } 1931c21e869SJeremy L Thompson } 1941c21e869SJeremy L Thompson } 1951c21e869SJeremy L Thompson 1961c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 1971c21e869SJeremy L Thompson // Grad 1981c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 1991c21e869SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 2001c21e869SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 2011c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 2021c21e869SJeremy L Thompson 2031c21e869SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 2041c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 2051c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 2061c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 2071c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 2081c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 2091c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 2101c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 2111c21e869SJeremy L Thompson } 2121c21e869SJeremy L Thompson 2131c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 2141c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 2151c21e869SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 2161c21e869SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 2171c21e869SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 2181c21e869SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 2191c21e869SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 2201c21e869SJeremy L Thompson const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; 2211c21e869SJeremy L Thompson const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 2221c21e869SJeremy L Thompson 2231c21e869SJeremy L Thompson // Apply basis element by element 2241c21e869SJeremy L Thompson if (is_transpose) { 2251c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 2261c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 2271c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 2281c21e869SJeremy L Thompson CeedInt pre = 1; 2291c21e869SJeremy L Thompson CeedInt post = 1; 2301c21e869SJeremy L Thompson 2311c21e869SJeremy L Thompson // Clear Chebyshev coeffs 2321c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 2331c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 2341c21e869SJeremy L Thompson } 2351c21e869SJeremy L Thompson 2361c21e869SJeremy L Thompson // Map from point 237*2d10e82cSJeremy L Thompson __syncthreads(); 238*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 2391c21e869SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 2401c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 2411c21e869SJeremy L Thompson 2421c21e869SJeremy L Thompson pre = 1; 2431c21e869SJeremy L Thompson post = 1; 2441c21e869SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 2451c21e869SJeremy L Thompson // Update buffers used 2461c21e869SJeremy L Thompson pre /= 1; 2471c21e869SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 2481c21e869SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 2491c21e869SJeremy L Thompson 2501c21e869SJeremy L Thompson // Build Chebyshev polynomial values 2511c21e869SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 2521c21e869SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 2531c21e869SJeremy L Thompson 2541c21e869SJeremy L Thompson // Contract along middle index 2551c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 2561c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 2571c21e869SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 258*2d10e82cSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + j) * post + c], chebyshev_x[j] * in[a * post + c]); 2591c21e869SJeremy L Thompson } else { 2601c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 2611c21e869SJeremy L Thompson } 2621c21e869SJeremy L Thompson } 2631c21e869SJeremy L Thompson } 2641c21e869SJeremy L Thompson post *= Q; 2651c21e869SJeremy L Thompson } 2661c21e869SJeremy L Thompson } 2671c21e869SJeremy L Thompson } 2681c21e869SJeremy L Thompson 2691c21e869SJeremy L Thompson // Map from coefficients 2701c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 2711c21e869SJeremy L Thompson post = 1; 2721c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 2731c21e869SJeremy L Thompson __syncthreads(); 2741c21e869SJeremy L Thompson // Update buffers used 2751c21e869SJeremy L Thompson pre /= Q; 2761c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 2771c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 2781c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 2791c21e869SJeremy L Thompson 2801c21e869SJeremy L Thompson // Contract along middle index 2811c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 2821c21e869SJeremy L Thompson const CeedInt c = k % post; 2831c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 2841c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 2851c21e869SJeremy L Thompson CeedScalar v_k = 0; 2861c21e869SJeremy L Thompson 2871c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 2881c21e869SJeremy L Thompson out[k] = v_k; 2891c21e869SJeremy L Thompson } 2901c21e869SJeremy L Thompson post *= P; 2911c21e869SJeremy L Thompson } 2921c21e869SJeremy L Thompson } 2931c21e869SJeremy L Thompson } 2941c21e869SJeremy L Thompson } else { 2951c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 2961c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 2971c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 2981c21e869SJeremy L Thompson CeedInt pre = u_size; 2991c21e869SJeremy L Thompson CeedInt post = 1; 3001c21e869SJeremy L Thompson 3011c21e869SJeremy L Thompson // Map to coefficients 3021c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 3031c21e869SJeremy L Thompson __syncthreads(); 3041c21e869SJeremy L Thompson // Update buffers used 3051c21e869SJeremy L Thompson pre /= P; 3061c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 3071c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 3081c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 3091c21e869SJeremy L Thompson 3101c21e869SJeremy L Thompson // Contract along middle index 3111c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 3121c21e869SJeremy L Thompson const CeedInt c = k % post; 3131c21e869SJeremy L Thompson const CeedInt j = (k / post) % Q; 3141c21e869SJeremy L Thompson const CeedInt a = k / (post * Q); 3151c21e869SJeremy L Thompson CeedScalar v_k = 0; 3161c21e869SJeremy L Thompson 3171c21e869SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 3181c21e869SJeremy L Thompson out[k] = v_k; 3191c21e869SJeremy L Thompson } 3201c21e869SJeremy L Thompson post *= Q; 3211c21e869SJeremy L Thompson } 3221c21e869SJeremy L Thompson 3231c21e869SJeremy L Thompson // Map to point 3241c21e869SJeremy L Thompson __syncthreads(); 325*2d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 3261c21e869SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 3271c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 3281c21e869SJeremy L Thompson 3291c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 3301c21e869SJeremy L Thompson post = 1; 3311c21e869SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 3321c21e869SJeremy L Thompson // Update buffers used 3331c21e869SJeremy L Thompson pre /= Q; 3341c21e869SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 3351c21e869SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 3361c21e869SJeremy L Thompson 3371c21e869SJeremy L Thompson // Build Chebyshev polynomial values 3381c21e869SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 3391c21e869SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 3401c21e869SJeremy L Thompson 3411c21e869SJeremy L Thompson // Contract along middle index 3421c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 3431c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 3441c21e869SJeremy L Thompson CeedScalar v_k = 0; 3451c21e869SJeremy L Thompson 3461c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 3471c21e869SJeremy L Thompson out[a * post + c] = v_k; 3481c21e869SJeremy L Thompson } 3491c21e869SJeremy L Thompson } 3501c21e869SJeremy L Thompson post *= 1; 3511c21e869SJeremy L Thompson } 3521c21e869SJeremy L Thompson } 3531c21e869SJeremy L Thompson } 3541c21e869SJeremy L Thompson } 3551c21e869SJeremy L Thompson } 3561c21e869SJeremy L Thompson } 3571c21e869SJeremy L Thompson } 358