134d14614SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 234d14614SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 334d14614SJeremy L Thompson // 434d14614SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 534d14614SJeremy L Thompson // 634d14614SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 734d14614SJeremy L Thompson 834d14614SJeremy L Thompson /// @file 934d14614SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 1134d14614SJeremy L Thompson 1234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1334d14614SJeremy L Thompson // Chebyshev values 1434d14614SJeremy L Thompson //------------------------------------------------------------------------------ 1534d14614SJeremy L Thompson template <int Q_1D> 1634d14614SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 1734d14614SJeremy L Thompson chebyshev_x[0] = 1.0; 1834d14614SJeremy L Thompson chebyshev_x[1] = 2 * x; 1934d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 2034d14614SJeremy L Thompson } 2134d14614SJeremy L Thompson 2234d14614SJeremy L Thompson template <int Q_1D> 2334d14614SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 2434d14614SJeremy L Thompson CeedScalar chebyshev_x[3]; 2534d14614SJeremy L Thompson 2634d14614SJeremy L Thompson chebyshev_x[1] = 1.0; 2734d14614SJeremy L Thompson chebyshev_x[2] = 2 * x; 2834d14614SJeremy L Thompson chebyshev_dx[0] = 0.0; 2934d14614SJeremy L Thompson chebyshev_dx[1] = 2.0; 3034d14614SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 3180c135a8SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 3280c135a8SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 3334d14614SJeremy L Thompson } 3434d14614SJeremy L Thompson } 3534d14614SJeremy L Thompson 3634d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3734d14614SJeremy L Thompson // Tensor Basis Kernels AtPoints 3834d14614SJeremy L Thompson //------------------------------------------------------------------------------ 3934d14614SJeremy L Thompson 4034d14614SJeremy L Thompson //------------------------------------------------------------------------------ 4134d14614SJeremy L Thompson // Interp 4234d14614SJeremy L Thompson //------------------------------------------------------------------------------ 43*81ae6159SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 44111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 45111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 4634d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 4734d14614SJeremy L Thompson 48f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 4934d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 5034d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 5134d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 5234d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 5334d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 5434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 5534d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 5634d14614SJeremy L Thompson } 5734d14614SJeremy L Thompson 5834d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 5934d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 60*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 61*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 62*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 63*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 64*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 6534d14614SJeremy L Thompson 6634d14614SJeremy L Thompson // Apply basis element by element 67*81ae6159SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 68*81ae6159SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 69*81ae6159SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 70*81ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 71*81ae6159SJeremy L Thompson CeedInt pre = u_size; 72*81ae6159SJeremy L Thompson CeedInt post = 1; 73*81ae6159SJeremy L Thompson 74*81ae6159SJeremy L Thompson // Map to coefficients 75*81ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 76*81ae6159SJeremy L Thompson __syncthreads(); 77*81ae6159SJeremy L Thompson // Update buffers used 78*81ae6159SJeremy L Thompson pre /= P; 79*81ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 80*81ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 81*81ae6159SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 82*81ae6159SJeremy L Thompson 83*81ae6159SJeremy L Thompson // Contract along middle index 84*81ae6159SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 85*81ae6159SJeremy L Thompson const CeedInt c = k % post; 86*81ae6159SJeremy L Thompson const CeedInt j = (k / post) % Q; 87*81ae6159SJeremy L Thompson const CeedInt a = k / (post * Q); 88*81ae6159SJeremy L Thompson CeedScalar v_k = 0; 89*81ae6159SJeremy L Thompson 90*81ae6159SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 91*81ae6159SJeremy L Thompson out[k] = v_k; 92*81ae6159SJeremy L Thompson } 93*81ae6159SJeremy L Thompson post *= Q; 94*81ae6159SJeremy L Thompson } 95*81ae6159SJeremy L Thompson 96*81ae6159SJeremy L Thompson // Map to point 97*81ae6159SJeremy L Thompson __syncthreads(); 98*81ae6159SJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 99*81ae6159SJeremy L Thompson pre = BASIS_NUM_QPTS; 100*81ae6159SJeremy L Thompson post = 1; 101*81ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 102*81ae6159SJeremy L Thompson // Update buffers used 103*81ae6159SJeremy L Thompson pre /= Q; 104*81ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 105*81ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); 106*81ae6159SJeremy L Thompson 107*81ae6159SJeremy L Thompson // Build Chebyshev polynomial values 108*81ae6159SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 109*81ae6159SJeremy L Thompson 110*81ae6159SJeremy L Thompson // Contract along middle index 111*81ae6159SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 112*81ae6159SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 113*81ae6159SJeremy L Thompson CeedScalar v_k = 0; 114*81ae6159SJeremy L Thompson 115*81ae6159SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 116*81ae6159SJeremy L Thompson out[a * post + c] = v_k; 117*81ae6159SJeremy L Thompson } 118*81ae6159SJeremy L Thompson } 119*81ae6159SJeremy L Thompson post *= 1; 120*81ae6159SJeremy L Thompson } 121*81ae6159SJeremy L Thompson } 122*81ae6159SJeremy L Thompson } 123*81ae6159SJeremy L Thompson } 124*81ae6159SJeremy L Thompson } 125*81ae6159SJeremy L Thompson 126*81ae6159SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 127*81ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 128*81ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 129*81ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 130*81ae6159SJeremy L Thompson 131*81ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 132*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 133*81ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 134*81ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 135*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 136*81ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 137*81ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 138*81ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 139*81ae6159SJeremy L Thompson } 140*81ae6159SJeremy L Thompson 141*81ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 142*81ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 143*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 144*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 145*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 146*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 147*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 148*81ae6159SJeremy L Thompson 149*81ae6159SJeremy L Thompson // Apply basis element by element 15034d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 15134d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 152db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 153db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 15434d14614SJeremy L Thompson CeedInt pre = 1; 15534d14614SJeremy L Thompson CeedInt post = 1; 15634d14614SJeremy L Thompson 15734d14614SJeremy L Thompson // Clear Chebyshev coeffs 15834d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 15934d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 16034d14614SJeremy L Thompson } 16134d14614SJeremy L Thompson 16234d14614SJeremy L Thompson // Map from point 1632d10e82cSJeremy L Thompson __syncthreads(); 1642d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 165111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 16634d14614SJeremy L Thompson pre = 1; 16734d14614SJeremy L Thompson post = 1; 16834d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 16934d14614SJeremy L Thompson // Update buffers used 17034d14614SJeremy L Thompson pre /= 1; 171db2becc9SJeremy L Thompson const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); 17234d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 17334d14614SJeremy L Thompson 17434d14614SJeremy L Thompson // Build Chebyshev polynomial values 17534d14614SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 17634d14614SJeremy L Thompson 17734d14614SJeremy L Thompson // Contract along middle index 17834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 17934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 18034d14614SJeremy L Thompson if (d == BASIS_DIM - 1) { 181ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 18234d14614SJeremy L Thompson } else { 18334d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 18434d14614SJeremy L Thompson } 18534d14614SJeremy L Thompson } 18634d14614SJeremy L Thompson } 18734d14614SJeremy L Thompson post *= Q; 18834d14614SJeremy L Thompson } 18934d14614SJeremy L Thompson } 19034d14614SJeremy L Thompson 19134d14614SJeremy L Thompson // Map from coefficients 19234d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 19334d14614SJeremy L Thompson post = 1; 19434d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 19534d14614SJeremy L Thompson __syncthreads(); 19634d14614SJeremy L Thompson // Update buffers used 19734d14614SJeremy L Thompson pre /= Q; 19834d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 19934d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 20034d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 20134d14614SJeremy L Thompson 20234d14614SJeremy L Thompson // Contract along middle index 20334d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 20434d14614SJeremy L Thompson const CeedInt c = k % post; 20534d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 20634d14614SJeremy L Thompson const CeedInt a = k / (post * P); 20734d14614SJeremy L Thompson CeedScalar v_k = 0; 20834d14614SJeremy L Thompson 20934d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 210db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 211db2becc9SJeremy L Thompson else out[k] = v_k; 21234d14614SJeremy L Thompson } 21334d14614SJeremy L Thompson post *= P; 21434d14614SJeremy L Thompson } 21534d14614SJeremy L Thompson } 21634d14614SJeremy L Thompson } 217*81ae6159SJeremy L Thompson } 218*81ae6159SJeremy L Thompson 219*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 220*81ae6159SJeremy L Thompson // Grad 221*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 222*81ae6159SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 223*81ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 224*81ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 225*81ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 226*81ae6159SJeremy L Thompson 227*81ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 228*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 229*81ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 230*81ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 231*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 232*81ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 233*81ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 234*81ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 235*81ae6159SJeremy L Thompson } 236*81ae6159SJeremy L Thompson 237*81ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 238*81ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 239*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 240*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 241*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 242*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 243*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 244*81ae6159SJeremy L Thompson const CeedInt u_dim_stride = 0; 245*81ae6159SJeremy L Thompson const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 246*81ae6159SJeremy L Thompson 247*81ae6159SJeremy L Thompson // Apply basis element by element 24834d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 24934d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 250db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 25134d14614SJeremy L Thompson CeedInt pre = u_size; 25234d14614SJeremy L Thompson CeedInt post = 1; 25334d14614SJeremy L Thompson 25434d14614SJeremy L Thompson // Map to coefficients 25534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 25634d14614SJeremy L Thompson __syncthreads(); 25734d14614SJeremy L Thompson // Update buffers used 25834d14614SJeremy L Thompson pre /= P; 25934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 26034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 26134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 26234d14614SJeremy L Thompson 26334d14614SJeremy L Thompson // Contract along middle index 26434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 26534d14614SJeremy L Thompson const CeedInt c = k % post; 26634d14614SJeremy L Thompson const CeedInt j = (k / post) % Q; 26734d14614SJeremy L Thompson const CeedInt a = k / (post * Q); 26834d14614SJeremy L Thompson CeedScalar v_k = 0; 26934d14614SJeremy L Thompson 27034d14614SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 27134d14614SJeremy L Thompson out[k] = v_k; 27234d14614SJeremy L Thompson } 27334d14614SJeremy L Thompson post *= Q; 27434d14614SJeremy L Thompson } 27534d14614SJeremy L Thompson 27634d14614SJeremy L Thompson // Map to point 27734d14614SJeremy L Thompson __syncthreads(); 2782d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 279*81ae6159SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 280*81ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; 281*81ae6159SJeremy L Thompson 28234d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 28334d14614SJeremy L Thompson post = 1; 284*81ae6159SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 28534d14614SJeremy L Thompson // Update buffers used 28634d14614SJeremy L Thompson pre /= Q; 287*81ae6159SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 288*81ae6159SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 28934d14614SJeremy L Thompson 29034d14614SJeremy L Thompson // Build Chebyshev polynomial values 291*81ae6159SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 292*81ae6159SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 29334d14614SJeremy L Thompson 29434d14614SJeremy L Thompson // Contract along middle index 29534d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 29634d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 29734d14614SJeremy L Thompson CeedScalar v_k = 0; 29834d14614SJeremy L Thompson 29934d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 30034d14614SJeremy L Thompson out[a * post + c] = v_k; 30134d14614SJeremy L Thompson } 30234d14614SJeremy L Thompson } 30334d14614SJeremy L Thompson post *= 1; 30434d14614SJeremy L Thompson } 30534d14614SJeremy L Thompson } 30634d14614SJeremy L Thompson } 30734d14614SJeremy L Thompson } 30834d14614SJeremy L Thompson } 30934d14614SJeremy L Thompson } 31034d14614SJeremy L Thompson 311*81ae6159SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 312111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 313111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 31434d14614SJeremy L Thompson const CeedInt i = threadIdx.x; 31534d14614SJeremy L Thompson 316f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 31734d14614SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 31834d14614SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 31934d14614SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 32034d14614SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 32134d14614SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 32234d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 32334d14614SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 32434d14614SJeremy L Thompson } 32534d14614SJeremy L Thompson 32634d14614SJeremy L Thompson const CeedInt P = BASIS_P_1D; 32734d14614SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 328*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 329*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 330*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 331*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 332*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 333*81ae6159SJeremy L Thompson const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 334*81ae6159SJeremy L Thompson const CeedInt v_dim_stride = 0; 33534d14614SJeremy L Thompson 33634d14614SJeremy L Thompson // Apply basis element by element 33734d14614SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 33834d14614SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 339db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 34034d14614SJeremy L Thompson CeedInt pre = 1; 34134d14614SJeremy L Thompson CeedInt post = 1; 34234d14614SJeremy L Thompson 34334d14614SJeremy L Thompson // Clear Chebyshev coeffs 34434d14614SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 34534d14614SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 34634d14614SJeremy L Thompson } 34734d14614SJeremy L Thompson 34834d14614SJeremy L Thompson // Map from point 3492d10e82cSJeremy L Thompson __syncthreads(); 3502d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 351111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 35234d14614SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 353db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; 35434d14614SJeremy L Thompson 35534d14614SJeremy L Thompson pre = 1; 35634d14614SJeremy L Thompson post = 1; 35734d14614SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 35834d14614SJeremy L Thompson // Update buffers used 35934d14614SJeremy L Thompson pre /= 1; 36034d14614SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 36134d14614SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 36234d14614SJeremy L Thompson 36334d14614SJeremy L Thompson // Build Chebyshev polynomial values 36434d14614SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 36534d14614SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 36634d14614SJeremy L Thompson 36734d14614SJeremy L Thompson // Contract along middle index 36834d14614SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 36934d14614SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 37034d14614SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 371ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 37234d14614SJeremy L Thompson } else { 37334d14614SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 37434d14614SJeremy L Thompson } 37534d14614SJeremy L Thompson } 37634d14614SJeremy L Thompson } 37734d14614SJeremy L Thompson post *= Q; 37834d14614SJeremy L Thompson } 37934d14614SJeremy L Thompson } 38034d14614SJeremy L Thompson } 38134d14614SJeremy L Thompson 38234d14614SJeremy L Thompson // Map from coefficients 38334d14614SJeremy L Thompson pre = BASIS_NUM_QPTS; 38434d14614SJeremy L Thompson post = 1; 38534d14614SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 38634d14614SJeremy L Thompson __syncthreads(); 38734d14614SJeremy L Thompson // Update buffers used 38834d14614SJeremy L Thompson pre /= Q; 38934d14614SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 39034d14614SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 39134d14614SJeremy L Thompson const CeedInt writeLen = pre * post * P; 39234d14614SJeremy L Thompson 39334d14614SJeremy L Thompson // Contract along middle index 39434d14614SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 39534d14614SJeremy L Thompson const CeedInt c = k % post; 39634d14614SJeremy L Thompson const CeedInt j = (k / post) % P; 39734d14614SJeremy L Thompson const CeedInt a = k / (post * P); 39834d14614SJeremy L Thompson CeedScalar v_k = 0; 39934d14614SJeremy L Thompson 40034d14614SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 401db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 402db2becc9SJeremy L Thompson else out[k] = v_k; 40334d14614SJeremy L Thompson } 40434d14614SJeremy L Thompson post *= P; 40534d14614SJeremy L Thompson } 40634d14614SJeremy L Thompson } 40734d14614SJeremy L Thompson } 40834d14614SJeremy L Thompson } 409