134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 334d14614SJeremy L Thompson // 434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 534d14614SJeremy L Thompson // 634d14614SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 734d14614SJeremy L Thompson 834d14614SJeremy L Thompson /// @file 934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 1034d14614SJeremy L Thompson 11*c0b5abf0SJeremy L Thompson #include <ceed/types.h> 1234d14614SJeremy L Thompson 1334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1434d14614SJeremy L Thompson // Chebyshev values 1534d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1634d14614SJeremy L Thompson template <int Q_1D> 1734d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 1834d14614SJeremy L Thompson chebyshev_x[0] = 1.0; 1934d14614SJeremy L Thompson chebyshev_x[1] = 2 * x; 2034d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 2134d14614SJeremy L Thompson } 2234d14614SJeremy L Thompson 2334d14614SJeremy L Thompson template <int Q_1D> 2434d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 2534d14614SJeremy L Thompson CeedScalar chebyshev_x[3]; 2634d14614SJeremy L Thompson 2734d14614SJeremy L Thompson chebyshev_x[1] = 1.0; 2834d14614SJeremy L Thompson chebyshev_x[2] = 2 * x; 2934d14614SJeremy L Thompson chebyshev_dx[0] = 0.0; 3034d14614SJeremy L Thompson chebyshev_dx[1] = 2.0; 3134d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 3280c135a8SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 3380c135a8SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 3434d14614SJeremy L Thompson } 3534d14614SJeremy L Thompson } 3634d14614SJeremy L Thompson 3734d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3834d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints 3934d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4034d14614SJeremy L Thompson 4134d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4234d14614SJeremy L Thompson // Interp 4334d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4434d14614SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 45111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 46111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 4734d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 4834d14614SJeremy L Thompson 49f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 5034d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 5134d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 5234d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 5334d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 5434d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 5534d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 5634d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 5734d14614SJeremy L Thompson } 5834d14614SJeremy L Thompson 5934d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 6034d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 6134d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6234d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 6334d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 6434d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 6534d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 6634d14614SJeremy L Thompson 6734d14614SJeremy L Thompson // Apply basis element by element 6834d14614SJeremy L Thompson if (is_transpose) { 6934d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 7034d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 71db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 72db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 7334d14614SJeremy L Thompson CeedInt pre = 1; 7434d14614SJeremy L Thompson CeedInt post = 1; 7534d14614SJeremy L Thompson 7634d14614SJeremy L Thompson // Clear Chebyshev coeffs 7734d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 7834d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 7934d14614SJeremy L Thompson } 8034d14614SJeremy L Thompson 8134d14614SJeremy L Thompson // Map from point 822d10e82cSJeremy L Thompson __syncthreads(); 832d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 84111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 8534d14614SJeremy L Thompson pre = 1; 8634d14614SJeremy L Thompson post = 1; 8734d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 8834d14614SJeremy L Thompson // Update buffers used 8934d14614SJeremy L Thompson pre /= 1; 90db2becc9SJeremy L Thompson const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); 9134d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 9234d14614SJeremy L Thompson 9334d14614SJeremy L Thompson // Build Chebyshev polynomial values 9434d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 9534d14614SJeremy L Thompson 9634d14614SJeremy L Thompson // Contract along middle index 9734d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 9834d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 9934d14614SJeremy L Thompson if (d == BASIS_DIM - 1) { 100ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 10134d14614SJeremy L Thompson } else { 10234d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 10334d14614SJeremy L Thompson } 10434d14614SJeremy L Thompson } 10534d14614SJeremy L Thompson } 10634d14614SJeremy L Thompson post *= Q; 10734d14614SJeremy L Thompson } 10834d14614SJeremy L Thompson } 10934d14614SJeremy L Thompson 11034d14614SJeremy L Thompson // Map from coefficients 11134d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 11234d14614SJeremy L Thompson post = 1; 11334d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 11434d14614SJeremy L Thompson __syncthreads(); 11534d14614SJeremy L Thompson // Update buffers used 11634d14614SJeremy L Thompson pre /= Q; 11734d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 11834d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 11934d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 12034d14614SJeremy L Thompson 12134d14614SJeremy L Thompson // Contract along middle index 12234d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 12334d14614SJeremy L Thompson const CeedInt c = k % post; 12434d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 12534d14614SJeremy L Thompson const CeedInt a = k / (post * P); 12634d14614SJeremy L Thompson CeedScalar v_k = 0; 12734d14614SJeremy L Thompson 12834d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 129db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 130db2becc9SJeremy L Thompson else out[k] = v_k; 13134d14614SJeremy L Thompson } 13234d14614SJeremy L Thompson post *= P; 13334d14614SJeremy L Thompson } 13434d14614SJeremy L Thompson } 13534d14614SJeremy L Thompson } 13634d14614SJeremy L Thompson } else { 13734d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 13834d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 139db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 140db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 14134d14614SJeremy L Thompson CeedInt pre = u_size; 14234d14614SJeremy L Thompson CeedInt post = 1; 14334d14614SJeremy L Thompson 14434d14614SJeremy L Thompson // Map to coefficients 14534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 14634d14614SJeremy L Thompson __syncthreads(); 14734d14614SJeremy L Thompson // Update buffers used 14834d14614SJeremy L Thompson pre /= P; 14934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 15034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 15134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 15234d14614SJeremy L Thompson 15334d14614SJeremy L Thompson // Contract along middle index 15434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 15534d14614SJeremy L Thompson const CeedInt c = k % post; 15634d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 15734d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 15834d14614SJeremy L Thompson CeedScalar v_k = 0; 15934d14614SJeremy L Thompson 16034d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 16134d14614SJeremy L Thompson out[k] = v_k; 16234d14614SJeremy L Thompson } 16334d14614SJeremy L Thompson post *= Q; 16434d14614SJeremy L Thompson } 16534d14614SJeremy L Thompson 16634d14614SJeremy L Thompson // Map to point 16734d14614SJeremy L Thompson __syncthreads(); 1682d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 16934d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 17034d14614SJeremy L Thompson post = 1; 17134d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 17234d14614SJeremy L Thompson // Update buffers used 17334d14614SJeremy L Thompson pre /= Q; 17434d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 175db2becc9SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); 17634d14614SJeremy L Thompson 17734d14614SJeremy L Thompson // Build Chebyshev polynomial values 17834d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 17934d14614SJeremy L Thompson 18034d14614SJeremy L Thompson // Contract along middle index 18134d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 18234d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 18334d14614SJeremy L Thompson CeedScalar v_k = 0; 18434d14614SJeremy L Thompson 18534d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 18634d14614SJeremy L Thompson out[a * post + c] = v_k; 18734d14614SJeremy L Thompson } 18834d14614SJeremy L Thompson } 18934d14614SJeremy L Thompson post *= 1; 19034d14614SJeremy L Thompson } 19134d14614SJeremy L Thompson } 19234d14614SJeremy L Thompson } 19334d14614SJeremy L Thompson } 19434d14614SJeremy L Thompson } 19534d14614SJeremy L Thompson } 19634d14614SJeremy L Thompson 19734d14614SJeremy L Thompson //------------------------------------------------------------------------------ 19834d14614SJeremy L Thompson // Grad 19934d14614SJeremy L Thompson //------------------------------------------------------------------------------ 20034d14614SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 201111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 202111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 20334d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 20434d14614SJeremy L Thompson 205f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 20634d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 20734d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 20834d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 20934d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 21034d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 21134d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 21234d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 21334d14614SJeremy L Thompson } 21434d14614SJeremy L Thompson 21534d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 21634d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 21734d14614SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 21834d14614SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 21934d14614SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 22034d14614SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 22134d14614SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 22234d14614SJeremy L Thompson const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; 22334d14614SJeremy L Thompson const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 22434d14614SJeremy L Thompson 22534d14614SJeremy L Thompson // Apply basis element by element 22634d14614SJeremy L Thompson if (is_transpose) { 22734d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 22834d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 229db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 23034d14614SJeremy L Thompson CeedInt pre = 1; 23134d14614SJeremy L Thompson CeedInt post = 1; 23234d14614SJeremy L Thompson 23334d14614SJeremy L Thompson // Clear Chebyshev coeffs 23434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 23534d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 23634d14614SJeremy L Thompson } 23734d14614SJeremy L Thompson 23834d14614SJeremy L Thompson // Map from point 2392d10e82cSJeremy L Thompson __syncthreads(); 2402d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 241111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 24234d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 243db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; 24434d14614SJeremy L Thompson 24534d14614SJeremy L Thompson pre = 1; 24634d14614SJeremy L Thompson post = 1; 24734d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 24834d14614SJeremy L Thompson // Update buffers used 24934d14614SJeremy L Thompson pre /= 1; 25034d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 25134d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 25234d14614SJeremy L Thompson 25334d14614SJeremy L Thompson // Build Chebyshev polynomial values 25434d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25534d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 25634d14614SJeremy L Thompson 25734d14614SJeremy L Thompson // Contract along middle index 25834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 25934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 26034d14614SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 261ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 26234d14614SJeremy L Thompson } else { 26334d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 26434d14614SJeremy L Thompson } 26534d14614SJeremy L Thompson } 26634d14614SJeremy L Thompson } 26734d14614SJeremy L Thompson post *= Q; 26834d14614SJeremy L Thompson } 26934d14614SJeremy L Thompson } 27034d14614SJeremy L Thompson } 27134d14614SJeremy L Thompson 27234d14614SJeremy L Thompson // Map from coefficients 27334d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 27434d14614SJeremy L Thompson post = 1; 27534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 27634d14614SJeremy L Thompson __syncthreads(); 27734d14614SJeremy L Thompson // Update buffers used 27834d14614SJeremy L Thompson pre /= Q; 27934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 28034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 28134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 28234d14614SJeremy L Thompson 28334d14614SJeremy L Thompson // Contract along middle index 28434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 28534d14614SJeremy L Thompson const CeedInt c = k % post; 28634d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 28734d14614SJeremy L Thompson const CeedInt a = k / (post * P); 28834d14614SJeremy L Thompson CeedScalar v_k = 0; 28934d14614SJeremy L Thompson 29034d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 291db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 292db2becc9SJeremy L Thompson else out[k] = v_k; 29334d14614SJeremy L Thompson } 29434d14614SJeremy L Thompson post *= P; 29534d14614SJeremy L Thompson } 29634d14614SJeremy L Thompson } 29734d14614SJeremy L Thompson } 29834d14614SJeremy L Thompson } else { 29934d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 30034d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 301db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 30234d14614SJeremy L Thompson CeedInt pre = u_size; 30334d14614SJeremy L Thompson CeedInt post = 1; 30434d14614SJeremy L Thompson 30534d14614SJeremy L Thompson // Map to coefficients 30634d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 30734d14614SJeremy L Thompson __syncthreads(); 30834d14614SJeremy L Thompson // Update buffers used 30934d14614SJeremy L Thompson pre /= P; 31034d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 31134d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 31234d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 31334d14614SJeremy L Thompson 31434d14614SJeremy L Thompson // Contract along middle index 31534d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 31634d14614SJeremy L Thompson const CeedInt c = k % post; 31734d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 31834d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 31934d14614SJeremy L Thompson CeedScalar v_k = 0; 32034d14614SJeremy L Thompson 32134d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 32234d14614SJeremy L Thompson out[k] = v_k; 32334d14614SJeremy L Thompson } 32434d14614SJeremy L Thompson post *= Q; 32534d14614SJeremy L Thompson } 32634d14614SJeremy L Thompson 32734d14614SJeremy L Thompson // Map to point 32834d14614SJeremy L Thompson __syncthreads(); 3292d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 33034d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 331db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; 33234d14614SJeremy L Thompson 33334d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 33434d14614SJeremy L Thompson post = 1; 33534d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 33634d14614SJeremy L Thompson // Update buffers used 33734d14614SJeremy L Thompson pre /= Q; 33834d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 33934d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 34034d14614SJeremy L Thompson 34134d14614SJeremy L Thompson // Build Chebyshev polynomial values 34234d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 34334d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 34434d14614SJeremy L Thompson 34534d14614SJeremy L Thompson // Contract along middle index 34634d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 34734d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 34834d14614SJeremy L Thompson CeedScalar v_k = 0; 34934d14614SJeremy L Thompson 35034d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 35134d14614SJeremy L Thompson out[a * post + c] = v_k; 35234d14614SJeremy L Thompson } 35334d14614SJeremy L Thompson } 35434d14614SJeremy L Thompson post *= 1; 35534d14614SJeremy L Thompson } 35634d14614SJeremy L Thompson } 35734d14614SJeremy L Thompson } 35834d14614SJeremy L Thompson } 35934d14614SJeremy L Thompson } 36034d14614SJeremy L Thompson } 36134d14614SJeremy L Thompson } 362