11c21e869SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 21c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 31c21e869SJeremy L Thompson // 41c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 51c21e869SJeremy L Thompson // 61c21e869SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 71c21e869SJeremy L Thompson 81c21e869SJeremy L Thompson /// @file 91c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 10c0b5abf0SJeremy L Thompson #include <ceed/types.h> 111c21e869SJeremy L Thompson 121c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 131c21e869SJeremy L Thompson // Chebyshev values 141c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 151c21e869SJeremy L Thompson template <int Q_1D> 161c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 171c21e869SJeremy L Thompson chebyshev_x[0] = 1.0; 181c21e869SJeremy L Thompson chebyshev_x[1] = 2 * x; 191c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 201c21e869SJeremy L Thompson } 211c21e869SJeremy L Thompson 221c21e869SJeremy L Thompson template <int Q_1D> 231c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 241c21e869SJeremy L Thompson CeedScalar chebyshev_x[3]; 251c21e869SJeremy L Thompson 261c21e869SJeremy L Thompson chebyshev_x[1] = 1.0; 271c21e869SJeremy L Thompson chebyshev_x[2] = 2 * x; 281c21e869SJeremy L Thompson chebyshev_dx[0] = 0.0; 291c21e869SJeremy L Thompson chebyshev_dx[1] = 2.0; 301c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 3180c135a8SJeremy L Thompson chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 3280c135a8SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 331c21e869SJeremy L Thompson } 341c21e869SJeremy L Thompson } 351c21e869SJeremy L Thompson 361c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 371c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints 381c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 391c21e869SJeremy L Thompson 401c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 411c21e869SJeremy L Thompson // Interp 421c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 43*81ae6159SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 44111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 45111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 461c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 471c21e869SJeremy L Thompson 48f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 491c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 501c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 511c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 521c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 531c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 541c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 551c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 561c21e869SJeremy L Thompson } 571c21e869SJeremy L Thompson 581c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 591c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 60*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 61*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 62*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 63*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 64*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 651c21e869SJeremy L Thompson 661c21e869SJeremy L Thompson // Apply basis element by element 67*81ae6159SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 68*81ae6159SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 69*81ae6159SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 70*81ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 71*81ae6159SJeremy L Thompson CeedInt pre = u_size; 72*81ae6159SJeremy L Thompson CeedInt post = 1; 73*81ae6159SJeremy L Thompson 74*81ae6159SJeremy L Thompson // Map to coefficients 75*81ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 76*81ae6159SJeremy L Thompson __syncthreads(); 77*81ae6159SJeremy L Thompson // Update buffers used 78*81ae6159SJeremy L Thompson pre /= P; 79*81ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 80*81ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 81*81ae6159SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 82*81ae6159SJeremy L Thompson 83*81ae6159SJeremy L Thompson // Contract along middle index 84*81ae6159SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 85*81ae6159SJeremy L Thompson const CeedInt c = k % post; 86*81ae6159SJeremy L Thompson const CeedInt j = (k / post) % Q; 87*81ae6159SJeremy L Thompson const CeedInt a = k / (post * Q); 88*81ae6159SJeremy L Thompson CeedScalar v_k = 0; 89*81ae6159SJeremy L Thompson 90*81ae6159SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 91*81ae6159SJeremy L Thompson out[k] = v_k; 92*81ae6159SJeremy L Thompson } 93*81ae6159SJeremy L Thompson post *= Q; 94*81ae6159SJeremy L Thompson } 95*81ae6159SJeremy L Thompson 96*81ae6159SJeremy L Thompson // Map to point 97*81ae6159SJeremy L Thompson __syncthreads(); 98*81ae6159SJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 99*81ae6159SJeremy L Thompson pre = BASIS_NUM_QPTS; 100*81ae6159SJeremy L Thompson post = 1; 101*81ae6159SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 102*81ae6159SJeremy L Thompson // Update buffers used 103*81ae6159SJeremy L Thompson pre /= Q; 104*81ae6159SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 105*81ae6159SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2); 106*81ae6159SJeremy L Thompson 107*81ae6159SJeremy L Thompson // Build Chebyshev polynomial values 108*81ae6159SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 109*81ae6159SJeremy L Thompson 110*81ae6159SJeremy L Thompson // Contract along middle index 111*81ae6159SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 112*81ae6159SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 113*81ae6159SJeremy L Thompson CeedScalar v_k = 0; 114*81ae6159SJeremy L Thompson 115*81ae6159SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 116*81ae6159SJeremy L Thompson out[a * post + c] = v_k; 117*81ae6159SJeremy L Thompson } 118*81ae6159SJeremy L Thompson } 119*81ae6159SJeremy L Thompson post *= 1; 120*81ae6159SJeremy L Thompson } 121*81ae6159SJeremy L Thompson } 122*81ae6159SJeremy L Thompson } 123*81ae6159SJeremy L Thompson } 124*81ae6159SJeremy L Thompson } 125*81ae6159SJeremy L Thompson 126*81ae6159SJeremy L Thompson extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 127*81ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 128*81ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 129*81ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 130*81ae6159SJeremy L Thompson 131*81ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 132*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 133*81ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 134*81ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 135*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 136*81ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 137*81ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 138*81ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 139*81ae6159SJeremy L Thompson } 140*81ae6159SJeremy L Thompson 141*81ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 142*81ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 143*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 144*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 145*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 146*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 147*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 148*81ae6159SJeremy L Thompson 149*81ae6159SJeremy L Thompson // Apply basis element by element 1501c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 1511c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 152db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 153db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 1541c21e869SJeremy L Thompson CeedInt pre = 1; 1551c21e869SJeremy L Thompson CeedInt post = 1; 1561c21e869SJeremy L Thompson 1571c21e869SJeremy L Thompson // Clear Chebyshev coeffs 1581c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 1591c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 1601c21e869SJeremy L Thompson } 1611c21e869SJeremy L Thompson 1621c21e869SJeremy L Thompson // Map from point 1632d10e82cSJeremy L Thompson __syncthreads(); 1642d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 165111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 1661c21e869SJeremy L Thompson pre = 1; 1671c21e869SJeremy L Thompson post = 1; 1681c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 1691c21e869SJeremy L Thompson // Update buffers used 1701c21e869SJeremy L Thompson pre /= 1; 171db2becc9SJeremy L Thompson const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1); 1721c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 1731c21e869SJeremy L Thompson 1741c21e869SJeremy L Thompson // Build Chebyshev polynomial values 1751c21e869SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 1761c21e869SJeremy L Thompson 1771c21e869SJeremy L Thompson // Contract along middle index 1781c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 1791c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 1801c21e869SJeremy L Thompson if (d == BASIS_DIM - 1) { 181ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 1821c21e869SJeremy L Thompson } else { 1831c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 1841c21e869SJeremy L Thompson } 1851c21e869SJeremy L Thompson } 1861c21e869SJeremy L Thompson } 1871c21e869SJeremy L Thompson post *= Q; 1881c21e869SJeremy L Thompson } 1891c21e869SJeremy L Thompson } 1901c21e869SJeremy L Thompson 1911c21e869SJeremy L Thompson // Map from coefficients 1921c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 1931c21e869SJeremy L Thompson post = 1; 1941c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 1951c21e869SJeremy L Thompson __syncthreads(); 1961c21e869SJeremy L Thompson // Update buffers used 1971c21e869SJeremy L Thompson pre /= Q; 1981c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 1991c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 2001c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 2011c21e869SJeremy L Thompson 2021c21e869SJeremy L Thompson // Contract along middle index 2031c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 2041c21e869SJeremy L Thompson const CeedInt c = k % post; 2051c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 2061c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 2071c21e869SJeremy L Thompson CeedScalar v_k = 0; 2081c21e869SJeremy L Thompson 2091c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 210db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 211db2becc9SJeremy L Thompson else out[k] = v_k; 2121c21e869SJeremy L Thompson } 2131c21e869SJeremy L Thompson post *= P; 2141c21e869SJeremy L Thompson } 2151c21e869SJeremy L Thompson } 2161c21e869SJeremy L Thompson } 217*81ae6159SJeremy L Thompson } 218*81ae6159SJeremy L Thompson 219*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 220*81ae6159SJeremy L Thompson // Grad 221*81ae6159SJeremy L Thompson //------------------------------------------------------------------------------ 222*81ae6159SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 223*81ae6159SJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 224*81ae6159SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 225*81ae6159SJeremy L Thompson const CeedInt i = threadIdx.x; 226*81ae6159SJeremy L Thompson 227*81ae6159SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 228*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 229*81ae6159SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 230*81ae6159SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 231*81ae6159SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 232*81ae6159SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 233*81ae6159SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 234*81ae6159SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 235*81ae6159SJeremy L Thompson } 236*81ae6159SJeremy L Thompson 237*81ae6159SJeremy L Thompson const CeedInt P = BASIS_P_1D; 238*81ae6159SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 239*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_NODES; 240*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_PTS; 241*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES; 242*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS; 243*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_NODES; 244*81ae6159SJeremy L Thompson const CeedInt u_dim_stride = 0; 245*81ae6159SJeremy L Thompson const CeedInt v_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 246*81ae6159SJeremy L Thompson 247*81ae6159SJeremy L Thompson // Apply basis element by element 2481c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 2491c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 250db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride]; 2511c21e869SJeremy L Thompson CeedInt pre = u_size; 2521c21e869SJeremy L Thompson CeedInt post = 1; 2531c21e869SJeremy L Thompson 2541c21e869SJeremy L Thompson // Map to coefficients 2551c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 2561c21e869SJeremy L Thompson __syncthreads(); 2571c21e869SJeremy L Thompson // Update buffers used 2581c21e869SJeremy L Thompson pre /= P; 2591c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 2601c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 2611c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 2621c21e869SJeremy L Thompson 2631c21e869SJeremy L Thompson // Contract along middle index 2641c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 2651c21e869SJeremy L Thompson const CeedInt c = k % post; 2661c21e869SJeremy L Thompson const CeedInt j = (k / post) % Q; 2671c21e869SJeremy L Thompson const CeedInt a = k / (post * Q); 2681c21e869SJeremy L Thompson CeedScalar v_k = 0; 2691c21e869SJeremy L Thompson 2701c21e869SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 2711c21e869SJeremy L Thompson out[k] = v_k; 2721c21e869SJeremy L Thompson } 2731c21e869SJeremy L Thompson post *= Q; 2741c21e869SJeremy L Thompson } 2751c21e869SJeremy L Thompson 2761c21e869SJeremy L Thompson // Map to point 2771c21e869SJeremy L Thompson __syncthreads(); 2782d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 279*81ae6159SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 280*81ae6159SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride]; 281*81ae6159SJeremy L Thompson 2821c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 2831c21e869SJeremy L Thompson post = 1; 284*81ae6159SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 2851c21e869SJeremy L Thompson // Update buffers used 2861c21e869SJeremy L Thompson pre /= Q; 287*81ae6159SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 288*81ae6159SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2); 2891c21e869SJeremy L Thompson 2901c21e869SJeremy L Thompson // Build Chebyshev polynomial values 291*81ae6159SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 292*81ae6159SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 2931c21e869SJeremy L Thompson 2941c21e869SJeremy L Thompson // Contract along middle index 2951c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 2961c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 2971c21e869SJeremy L Thompson CeedScalar v_k = 0; 2981c21e869SJeremy L Thompson 2991c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 3001c21e869SJeremy L Thompson out[a * post + c] = v_k; 3011c21e869SJeremy L Thompson } 3021c21e869SJeremy L Thompson } 3031c21e869SJeremy L Thompson post *= 1; 3041c21e869SJeremy L Thompson } 3051c21e869SJeremy L Thompson } 3061c21e869SJeremy L Thompson } 3071c21e869SJeremy L Thompson } 3081c21e869SJeremy L Thompson } 3091c21e869SJeremy L Thompson } 3101c21e869SJeremy L Thompson 311*81ae6159SJeremy L Thompson extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d, 312111870feSJeremy L Thompson const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords, 313111870feSJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 3141c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 3151c21e869SJeremy L Thompson 316f7c9815fSJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D]; 3171c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 3181c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 3191c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 3201c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 3211c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 3221c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 3231c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 3241c21e869SJeremy L Thompson } 3251c21e869SJeremy L Thompson 3261c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 3271c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 328*81ae6159SJeremy L Thompson const CeedInt u_stride = BASIS_NUM_PTS; 329*81ae6159SJeremy L Thompson const CeedInt v_stride = BASIS_NUM_NODES; 330*81ae6159SJeremy L Thompson const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS; 331*81ae6159SJeremy L Thompson const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES; 332*81ae6159SJeremy L Thompson const CeedInt u_size = BASIS_NUM_PTS; 333*81ae6159SJeremy L Thompson const CeedInt u_dim_stride = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 334*81ae6159SJeremy L Thompson const CeedInt v_dim_stride = 0; 3351c21e869SJeremy L Thompson 3361c21e869SJeremy L Thompson // Apply basis element by element 3371c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 3381c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 339db2becc9SJeremy L Thompson CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride]; 3401c21e869SJeremy L Thompson CeedInt pre = 1; 3411c21e869SJeremy L Thompson CeedInt post = 1; 3421c21e869SJeremy L Thompson 3431c21e869SJeremy L Thompson // Clear Chebyshev coeffs 3441c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 3451c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 3461c21e869SJeremy L Thompson } 3471c21e869SJeremy L Thompson 3481c21e869SJeremy L Thompson // Map from point 3492d10e82cSJeremy L Thompson __syncthreads(); 3502d10e82cSJeremy L Thompson for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) { 351111870feSJeremy L Thompson if (p >= points_per_elem[elem]) continue; 3521c21e869SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 353db2becc9SJeremy L Thompson const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride]; 3541c21e869SJeremy L Thompson 3551c21e869SJeremy L Thompson pre = 1; 3561c21e869SJeremy L Thompson post = 1; 3571c21e869SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 3581c21e869SJeremy L Thompson // Update buffers used 3591c21e869SJeremy L Thompson pre /= 1; 360db2becc9SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1); 3611c21e869SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 3621c21e869SJeremy L Thompson 3631c21e869SJeremy L Thompson // Build Chebyshev polynomial values 3641c21e869SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 3651c21e869SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 3661c21e869SJeremy L Thompson 3671c21e869SJeremy L Thompson // Contract along middle index 3681c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 3691c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 3701c21e869SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 371ad8059fcSJeremy L Thompson for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]); 3721c21e869SJeremy L Thompson } else { 3731c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 3741c21e869SJeremy L Thompson } 3751c21e869SJeremy L Thompson } 3761c21e869SJeremy L Thompson } 3771c21e869SJeremy L Thompson post *= Q; 3781c21e869SJeremy L Thompson } 3791c21e869SJeremy L Thompson } 3801c21e869SJeremy L Thompson } 3811c21e869SJeremy L Thompson 3821c21e869SJeremy L Thompson // Map from coefficients 3831c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 3841c21e869SJeremy L Thompson post = 1; 3851c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 3861c21e869SJeremy L Thompson __syncthreads(); 3871c21e869SJeremy L Thompson // Update buffers used 3881c21e869SJeremy L Thompson pre /= Q; 3891c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 3901c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 3911c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 3921c21e869SJeremy L Thompson 3931c21e869SJeremy L Thompson // Contract along middle index 3941c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 3951c21e869SJeremy L Thompson const CeedInt c = k % post; 3961c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 3971c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 3981c21e869SJeremy L Thompson CeedScalar v_k = 0; 3991c21e869SJeremy L Thompson 4001c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 401db2becc9SJeremy L Thompson if (d == BASIS_DIM - 1) out[k] += v_k; 402db2becc9SJeremy L Thompson else out[k] = v_k; 4031c21e869SJeremy L Thompson } 4041c21e869SJeremy L Thompson post *= P; 4051c21e869SJeremy L Thompson } 4061c21e869SJeremy L Thompson } 4071c21e869SJeremy L Thompson } 4081c21e869SJeremy L Thompson } 409