1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 334d14614SJeremy L Thompson // 434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 534d14614SJeremy L Thompson // 634d14614SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 734d14614SJeremy L Thompson 834d14614SJeremy L Thompson /// @file 934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 1134d14614SJeremy L Thompson 1234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1334d14614SJeremy L Thompson // Chebyshev values 1434d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1534d14614SJeremy L Thompson template <int Q_1D> 1634d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 1734d14614SJeremy L Thompson chebyshev_x[0] = 1.0; 1834d14614SJeremy L Thompson chebyshev_x[1] = 2 * x; 1934d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 2034d14614SJeremy L Thompson } 2134d14614SJeremy L Thompson 2234d14614SJeremy L Thompson template <int Q_1D> 2334d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 2434d14614SJeremy L Thompson CeedScalar chebyshev_x[3]; 2534d14614SJeremy L Thompson 2634d14614SJeremy L Thompson chebyshev_x[1] = 1.0; 2734d14614SJeremy L Thompson chebyshev_x[2] = 2 * x; 2834d14614SJeremy L Thompson chebyshev_dx[0] = 0.0; 2934d14614SJeremy L Thompson chebyshev_dx[1] = 2.0; 3034d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 3180c135a8SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 3280c135a8SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 3334d14614SJeremy L Thompson } 3434d14614SJeremy L Thompson } 3534d14614SJeremy L Thompson 3634d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3734d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints 3834d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3934d14614SJeremy L Thompson 4034d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4134d14614SJeremy L Thompson // Interp 4234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4381ae6159SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 44111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 45111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 4634d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 4734d14614SJeremy L Thompson 48f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 4934d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 5034d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 5134d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 5234d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 5334d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 5434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 5534d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 5634d14614SJeremy L Thompson } 5734d14614SJeremy L Thompson 5834d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 5934d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 6081ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 6181ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 6281ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 6381ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 6481ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 6534d14614SJeremy L Thompson 6634d14614SJeremy L Thompson // Apply basis element by element 6781ae6159SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 6881ae6159SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 6981ae6159SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 7081ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 7181ae6159SJeremy L Thompson CeedInt pre = u_size; 7281ae6159SJeremy L Thompson CeedInt post = 1; 7381ae6159SJeremy L Thompson 7481ae6159SJeremy L Thompson // Map to coefficients 7581ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 7681ae6159SJeremy L Thompson __syncthreads(); 7781ae6159SJeremy L Thompson // Update buffers used 7881ae6159SJeremy L Thompson pre /= P; 7981ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 8081ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 8181ae6159SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 8281ae6159SJeremy L Thompson 8381ae6159SJeremy L Thompson // Contract along middle index 8481ae6159SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 8581ae6159SJeremy L Thompson const CeedInt c = k % post; 8681ae6159SJeremy L Thompson const CeedInt j = (k / post) % Q; 8781ae6159SJeremy L Thompson const CeedInt a = k / (post * Q); 8881ae6159SJeremy L Thompson CeedScalar v_k = 0; 8981ae6159SJeremy L Thompson 9081ae6159SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 9181ae6159SJeremy L Thompson out[k] = v_k; 9281ae6159SJeremy L Thompson } 9381ae6159SJeremy L Thompson post *= Q; 9481ae6159SJeremy L Thompson } 9581ae6159SJeremy L Thompson 9681ae6159SJeremy L Thompson // Map to point 9781ae6159SJeremy L Thompson __syncthreads(); 9881ae6159SJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 9981ae6159SJeremy L Thompson pre = BASIS_NUM_QPTS; 10081ae6159SJeremy L Thompson post = 1; 10181ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 10281ae6159SJeremy L Thompson // Update buffers used 10381ae6159SJeremy L Thompson pre /= Q; 10481ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 10581ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); 10681ae6159SJeremy L Thompson 10781ae6159SJeremy L Thompson // Build Chebyshev polynomial values 10881ae6159SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 10981ae6159SJeremy L Thompson 11081ae6159SJeremy L Thompson // Contract along middle index 11181ae6159SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 11281ae6159SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 11381ae6159SJeremy L Thompson CeedScalar v_k = 0; 11481ae6159SJeremy L Thompson 11581ae6159SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 11681ae6159SJeremy L Thompson out[a * post + c] = v_k; 11781ae6159SJeremy L Thompson } 11881ae6159SJeremy L Thompson } 11981ae6159SJeremy L Thompson post *= 1; 12081ae6159SJeremy L Thompson } 12181ae6159SJeremy L Thompson } 12281ae6159SJeremy L Thompson } 12381ae6159SJeremy L Thompson } 12481ae6159SJeremy L Thompson } 12581ae6159SJeremy L Thompson 12681ae6159SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 12781ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 12881ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 12981ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 13081ae6159SJeremy L Thompson 13181ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 13281ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 13381ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 13481ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 13581ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 13681ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 13781ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 13881ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 13981ae6159SJeremy L Thompson } 14081ae6159SJeremy L Thompson 14181ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 14281ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 14381ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 14481ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 14581ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 14681ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 14781ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 14881ae6159SJeremy L Thompson 14981ae6159SJeremy L Thompson // Apply basis element by element 15034d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 15134d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 152db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 153db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 15434d14614SJeremy L Thompson CeedInt pre = 1; 15534d14614SJeremy L Thompson CeedInt post = 1; 15634d14614SJeremy L Thompson 15734d14614SJeremy L Thompson // Clear Chebyshev coeffs 15834d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 15934d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 16034d14614SJeremy L Thompson } 16134d14614SJeremy L Thompson 16234d14614SJeremy L Thompson // Map from point 1632d10e82cSJeremy L Thompson __syncthreads(); 1642d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 165111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 16634d14614SJeremy L Thompson pre = 1; 16734d14614SJeremy L Thompson post = 1; 16834d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 16934d14614SJeremy L Thompson // Update buffers used 17034d14614SJeremy L Thompson pre /= 1; 171db2becc9SJeremy L Thompson const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); 17234d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 17334d14614SJeremy L Thompson 17434d14614SJeremy L Thompson // Build Chebyshev polynomial values 17534d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 17634d14614SJeremy L Thompson 17734d14614SJeremy L Thompson // Contract along middle index 17834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 17934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 18034d14614SJeremy L Thompson if (d == BASIS_DIM - 1) { 181ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 18234d14614SJeremy L Thompson } else { 18334d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 18434d14614SJeremy L Thompson } 18534d14614SJeremy L Thompson } 18634d14614SJeremy L Thompson } 18734d14614SJeremy L Thompson post *= Q; 18834d14614SJeremy L Thompson } 18934d14614SJeremy L Thompson } 19034d14614SJeremy L Thompson 19134d14614SJeremy L Thompson // Map from coefficients 19234d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 19334d14614SJeremy L Thompson post = 1; 19434d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 19534d14614SJeremy L Thompson __syncthreads(); 19634d14614SJeremy L Thompson // Update buffers used 19734d14614SJeremy L Thompson pre /= Q; 19834d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 19934d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 20034d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 20134d14614SJeremy L Thompson 20234d14614SJeremy L Thompson // Contract along middle index 20334d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 20434d14614SJeremy L Thompson const CeedInt c = k % post; 20534d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 20634d14614SJeremy L Thompson const CeedInt a = k / (post * P); 20734d14614SJeremy L Thompson CeedScalar v_k = 0; 20834d14614SJeremy L Thompson 20934d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 210db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 211db2becc9SJeremy L Thompson else out[k] = v_k; 21234d14614SJeremy L Thompson } 21334d14614SJeremy L Thompson post *= P; 21434d14614SJeremy L Thompson } 21534d14614SJeremy L Thompson } 21634d14614SJeremy L Thompson } 21781ae6159SJeremy L Thompson } 21881ae6159SJeremy L Thompson 21981ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 22081ae6159SJeremy L Thompson // Grad 22181ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 22281ae6159SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 22381ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 22481ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 22581ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 22681ae6159SJeremy L Thompson 22781ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 22881ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 22981ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 23081ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 23181ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 23281ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 23381ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 23481ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 23581ae6159SJeremy L Thompson } 23681ae6159SJeremy L Thompson 23781ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 23881ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 23981ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 24081ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 24181ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 24281ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 24381ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 24481ae6159SJeremy L Thompson const CeedInt u_dim_stride = 0; 24581ae6159SJeremy L Thompson const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 24681ae6159SJeremy L Thompson 24781ae6159SJeremy L Thompson // Apply basis element by element 24834d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 24934d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 250db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 25134d14614SJeremy L Thompson CeedInt pre = u_size; 25234d14614SJeremy L Thompson CeedInt post = 1; 25334d14614SJeremy L Thompson 25434d14614SJeremy L Thompson // Map to coefficients 25534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 25634d14614SJeremy L Thompson __syncthreads(); 25734d14614SJeremy L Thompson // Update buffers used 25834d14614SJeremy L Thompson pre /= P; 25934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 26034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 26134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 26234d14614SJeremy L Thompson 26334d14614SJeremy L Thompson // Contract along middle index 26434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 26534d14614SJeremy L Thompson const CeedInt c = k % post; 26634d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 26734d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 26834d14614SJeremy L Thompson CeedScalar v_k = 0; 26934d14614SJeremy L Thompson 27034d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 27134d14614SJeremy L Thompson out[k] = v_k; 27234d14614SJeremy L Thompson } 27334d14614SJeremy L Thompson post *= Q; 27434d14614SJeremy L Thompson } 27534d14614SJeremy L Thompson 27634d14614SJeremy L Thompson // Map to point 27734d14614SJeremy L Thompson __syncthreads(); 2782d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 27981ae6159SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 28081ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; 28181ae6159SJeremy L Thompson 28234d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 28334d14614SJeremy L Thompson post = 1; 28481ae6159SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 28534d14614SJeremy L Thompson // Update buffers used 28634d14614SJeremy L Thompson pre /= Q; 28781ae6159SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 28881ae6159SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 28934d14614SJeremy L Thompson 29034d14614SJeremy L Thompson // Build Chebyshev polynomial values 29181ae6159SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 29281ae6159SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 29334d14614SJeremy L Thompson 29434d14614SJeremy L Thompson // Contract along middle index 29534d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 29634d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 29734d14614SJeremy L Thompson CeedScalar v_k = 0; 29834d14614SJeremy L Thompson 29934d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 30034d14614SJeremy L Thompson out[a * post + c] = v_k; 30134d14614SJeremy L Thompson } 30234d14614SJeremy L Thompson } 30334d14614SJeremy L Thompson post *= 1; 30434d14614SJeremy L Thompson } 30534d14614SJeremy L Thompson } 30634d14614SJeremy L Thompson } 30734d14614SJeremy L Thompson } 30834d14614SJeremy L Thompson } 30934d14614SJeremy L Thompson } 31034d14614SJeremy L Thompson 31181ae6159SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 312111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 313111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 31434d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 31534d14614SJeremy L Thompson 316f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 31734d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 31834d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 31934d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 32034d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 32134d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 32234d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 32334d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 32434d14614SJeremy L Thompson } 32534d14614SJeremy L Thompson 32634d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 32734d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 32881ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 32981ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 33081ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 33181ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 33281ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 33381ae6159SJeremy L Thompson const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 33481ae6159SJeremy L Thompson const CeedInt v_dim_stride = 0; 33534d14614SJeremy L Thompson 33634d14614SJeremy L Thompson // Apply basis element by element 33734d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 33834d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 339db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 34034d14614SJeremy L Thompson CeedInt pre = 1; 34134d14614SJeremy L Thompson CeedInt post = 1; 34234d14614SJeremy L Thompson 34334d14614SJeremy L Thompson // Clear Chebyshev coeffs 34434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 34534d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 34634d14614SJeremy L Thompson } 34734d14614SJeremy L Thompson 34834d14614SJeremy L Thompson // Map from point 3492d10e82cSJeremy L Thompson __syncthreads(); 3502d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 351111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 35234d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 353db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; 35434d14614SJeremy L Thompson 35534d14614SJeremy L Thompson pre = 1; 35634d14614SJeremy L Thompson post = 1; 35734d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 35834d14614SJeremy L Thompson // Update buffers used 35934d14614SJeremy L Thompson pre /= 1; 36034d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 36134d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 36234d14614SJeremy L Thompson 36334d14614SJeremy L Thompson // Build Chebyshev polynomial values 36434d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 36534d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 36634d14614SJeremy L Thompson 36734d14614SJeremy L Thompson // Contract along middle index 36834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 36934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 37034d14614SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 371ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 37234d14614SJeremy L Thompson } else { 37334d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 37434d14614SJeremy L Thompson } 37534d14614SJeremy L Thompson } 37634d14614SJeremy L Thompson } 37734d14614SJeremy L Thompson post *= Q; 37834d14614SJeremy L Thompson } 37934d14614SJeremy L Thompson } 38034d14614SJeremy L Thompson } 38134d14614SJeremy L Thompson 38234d14614SJeremy L Thompson // Map from coefficients 38334d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 38434d14614SJeremy L Thompson post = 1; 38534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 38634d14614SJeremy L Thompson __syncthreads(); 38734d14614SJeremy L Thompson // Update buffers used 38834d14614SJeremy L Thompson pre /= Q; 38934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 39034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 39134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 39234d14614SJeremy L Thompson 39334d14614SJeremy L Thompson // Contract along middle index 39434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 39534d14614SJeremy L Thompson const CeedInt c = k % post; 39634d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 39734d14614SJeremy L Thompson const CeedInt a = k / (post * P); 39834d14614SJeremy L Thompson CeedScalar v_k = 0; 39934d14614SJeremy L Thompson 40034d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 401db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 402db2becc9SJeremy L Thompson else out[k] = v_k; 40334d14614SJeremy L Thompson } 40434d14614SJeremy L Thompson post *= P; 40534d14614SJeremy L Thompson } 40634d14614SJeremy L Thompson } 40734d14614SJeremy L Thompson } 40834d14614SJeremy L Thompson } 409