1*1c21e869SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2*1c21e869SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3*1c21e869SJeremy L Thompson // 4*1c21e869SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5*1c21e869SJeremy L Thompson // 6*1c21e869SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 7*1c21e869SJeremy L Thompson 8*1c21e869SJeremy L Thompson /// @file 9*1c21e869SJeremy L Thompson /// Internal header for CUDA tensor product basis with AtPoints evaluation 10*1c21e869SJeremy L Thompson 11*1c21e869SJeremy L Thompson #include <ceed.h> 12*1c21e869SJeremy L Thompson 13*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 14*1c21e869SJeremy L Thompson // Chebyshev values 15*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 16*1c21e869SJeremy L Thompson template <int Q_1D> 17*1c21e869SJeremy L Thompson inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 18*1c21e869SJeremy L Thompson chebyshev_x[0] = 1.0; 19*1c21e869SJeremy L Thompson chebyshev_x[1] = 2 * x; 20*1c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 21*1c21e869SJeremy L Thompson } 22*1c21e869SJeremy L Thompson 23*1c21e869SJeremy L Thompson template <int Q_1D> 24*1c21e869SJeremy L Thompson inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 25*1c21e869SJeremy L Thompson CeedScalar chebyshev_x[3]; 26*1c21e869SJeremy L Thompson 27*1c21e869SJeremy L Thompson chebyshev_x[1] = 1.0; 28*1c21e869SJeremy L Thompson chebyshev_x[2] = 2 * x; 29*1c21e869SJeremy L Thompson chebyshev_dx[0] = 0.0; 30*1c21e869SJeremy L Thompson chebyshev_dx[1] = 2.0; 31*1c21e869SJeremy L Thompson for (CeedInt i = 2; i < Q_1D; i++) { 32*1c21e869SJeremy L Thompson chebyshev_x[0] = chebyshev_x[1]; 33*1c21e869SJeremy L Thompson chebyshev_x[1] = chebyshev_x[2]; 34*1c21e869SJeremy L Thompson chebyshev_x[2] = 2 * x * chebyshev_x[1] - chebyshev_x[0]; 35*1c21e869SJeremy L Thompson chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[1] - chebyshev_dx[i - 2]; 36*1c21e869SJeremy L Thompson } 37*1c21e869SJeremy L Thompson } 38*1c21e869SJeremy L Thompson 39*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 40*1c21e869SJeremy L Thompson // Tensor Basis Kernels AtPoints 41*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 42*1c21e869SJeremy L Thompson 43*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 44*1c21e869SJeremy L Thompson // Interp 45*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 46*1c21e869SJeremy L Thompson extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 47*1c21e869SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 48*1c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 49*1c21e869SJeremy L Thompson 50*1c21e869SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 51*1c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 52*1c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 53*1c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 54*1c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 55*1c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 56*1c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 57*1c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 58*1c21e869SJeremy L Thompson } 59*1c21e869SJeremy L Thompson 60*1c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 61*1c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 62*1c21e869SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 63*1c21e869SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 64*1c21e869SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 65*1c21e869SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 66*1c21e869SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 67*1c21e869SJeremy L Thompson 68*1c21e869SJeremy L Thompson // Apply basis element by element 69*1c21e869SJeremy L Thompson if (is_transpose) { 70*1c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 71*1c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 72*1c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 73*1c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 74*1c21e869SJeremy L Thompson CeedInt pre = 1; 75*1c21e869SJeremy L Thompson CeedInt post = 1; 76*1c21e869SJeremy L Thompson 77*1c21e869SJeremy L Thompson // Clear Chebyshev coeffs 78*1c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 79*1c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 80*1c21e869SJeremy L Thompson } 81*1c21e869SJeremy L Thompson 82*1c21e869SJeremy L Thompson // Map from point 83*1c21e869SJeremy L Thompson for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) { 84*1c21e869SJeremy L Thompson pre = 1; 85*1c21e869SJeremy L Thompson post = 1; 86*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 87*1c21e869SJeremy L Thompson // Update buffers used 88*1c21e869SJeremy L Thompson pre /= 1; 89*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? (cur_u + p) : (d % 2 ? buffer_2 : buffer_1); 90*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2); 91*1c21e869SJeremy L Thompson 92*1c21e869SJeremy L Thompson // Build Chebyshev polynomial values 93*1c21e869SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x); 94*1c21e869SJeremy L Thompson 95*1c21e869SJeremy L Thompson // Contract along middle index 96*1c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 97*1c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 98*1c21e869SJeremy L Thompson if (d == BASIS_DIM - 1) { 99*1c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c]; 100*1c21e869SJeremy L Thompson } else { 101*1c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 102*1c21e869SJeremy L Thompson } 103*1c21e869SJeremy L Thompson } 104*1c21e869SJeremy L Thompson } 105*1c21e869SJeremy L Thompson post *= Q; 106*1c21e869SJeremy L Thompson } 107*1c21e869SJeremy L Thompson } 108*1c21e869SJeremy L Thompson 109*1c21e869SJeremy L Thompson // Map from coefficients 110*1c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 111*1c21e869SJeremy L Thompson post = 1; 112*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 113*1c21e869SJeremy L Thompson __syncthreads(); 114*1c21e869SJeremy L Thompson // Update buffers used 115*1c21e869SJeremy L Thompson pre /= Q; 116*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 117*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 118*1c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 119*1c21e869SJeremy L Thompson 120*1c21e869SJeremy L Thompson // Contract along middle index 121*1c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 122*1c21e869SJeremy L Thompson const CeedInt c = k % post; 123*1c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 124*1c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 125*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 126*1c21e869SJeremy L Thompson 127*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 128*1c21e869SJeremy L Thompson out[k] = v_k; 129*1c21e869SJeremy L Thompson } 130*1c21e869SJeremy L Thompson post *= P; 131*1c21e869SJeremy L Thompson } 132*1c21e869SJeremy L Thompson } 133*1c21e869SJeremy L Thompson } 134*1c21e869SJeremy L Thompson } else { 135*1c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 136*1c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 137*1c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 138*1c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 139*1c21e869SJeremy L Thompson CeedInt pre = u_size; 140*1c21e869SJeremy L Thompson CeedInt post = 1; 141*1c21e869SJeremy L Thompson 142*1c21e869SJeremy L Thompson // Map to coefficients 143*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 144*1c21e869SJeremy L Thompson __syncthreads(); 145*1c21e869SJeremy L Thompson // Update buffers used 146*1c21e869SJeremy L Thompson pre /= P; 147*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 148*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 149*1c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 150*1c21e869SJeremy L Thompson 151*1c21e869SJeremy L Thompson // Contract along middle index 152*1c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 153*1c21e869SJeremy L Thompson const CeedInt c = k % post; 154*1c21e869SJeremy L Thompson const CeedInt j = (k / post) % Q; 155*1c21e869SJeremy L Thompson const CeedInt a = k / (post * Q); 156*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 157*1c21e869SJeremy L Thompson 158*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 159*1c21e869SJeremy L Thompson out[k] = v_k; 160*1c21e869SJeremy L Thompson } 161*1c21e869SJeremy L Thompson post *= Q; 162*1c21e869SJeremy L Thompson } 163*1c21e869SJeremy L Thompson 164*1c21e869SJeremy L Thompson // Map to point 165*1c21e869SJeremy L Thompson __syncthreads(); 166*1c21e869SJeremy L Thompson for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) { 167*1c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 168*1c21e869SJeremy L Thompson post = 1; 169*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 170*1c21e869SJeremy L Thompson // Update buffers used 171*1c21e869SJeremy L Thompson pre /= Q; 172*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1); 173*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? (cur_v + p) : (d % 2 ? buffer_1 : buffer_2); 174*1c21e869SJeremy L Thompson 175*1c21e869SJeremy L Thompson // Build Chebyshev polynomial values 176*1c21e869SJeremy L Thompson ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x); 177*1c21e869SJeremy L Thompson 178*1c21e869SJeremy L Thompson // Contract along middle index 179*1c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 180*1c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 181*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 182*1c21e869SJeremy L Thompson 183*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 184*1c21e869SJeremy L Thompson out[a * post + c] = v_k; 185*1c21e869SJeremy L Thompson } 186*1c21e869SJeremy L Thompson } 187*1c21e869SJeremy L Thompson post *= 1; 188*1c21e869SJeremy L Thompson } 189*1c21e869SJeremy L Thompson } 190*1c21e869SJeremy L Thompson } 191*1c21e869SJeremy L Thompson } 192*1c21e869SJeremy L Thompson } 193*1c21e869SJeremy L Thompson } 194*1c21e869SJeremy L Thompson 195*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 196*1c21e869SJeremy L Thompson // Grad 197*1c21e869SJeremy L Thompson //------------------------------------------------------------------------------ 198*1c21e869SJeremy L Thompson extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d, 199*1c21e869SJeremy L Thompson const CeedScalar *__restrict__ coords, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 200*1c21e869SJeremy L Thompson const CeedInt i = threadIdx.x; 201*1c21e869SJeremy L Thompson 202*1c21e869SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 3 * BASIS_BUF_LEN]; 203*1c21e869SJeremy L Thompson CeedScalar *s_chebyshev_interp_1d = s_mem; 204*1c21e869SJeremy L Thompson CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D; 205*1c21e869SJeremy L Thompson CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN; 206*1c21e869SJeremy L Thompson CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN; 207*1c21e869SJeremy L Thompson CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN]; 208*1c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) { 209*1c21e869SJeremy L Thompson s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k]; 210*1c21e869SJeremy L Thompson } 211*1c21e869SJeremy L Thompson 212*1c21e869SJeremy L Thompson const CeedInt P = BASIS_P_1D; 213*1c21e869SJeremy L Thompson const CeedInt Q = BASIS_Q_1D; 214*1c21e869SJeremy L Thompson const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 215*1c21e869SJeremy L Thompson const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS; 216*1c21e869SJeremy L Thompson const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES); 217*1c21e869SJeremy L Thompson const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS); 218*1c21e869SJeremy L Thompson const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES; 219*1c21e869SJeremy L Thompson const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0; 220*1c21e869SJeremy L Thompson const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP; 221*1c21e869SJeremy L Thompson 222*1c21e869SJeremy L Thompson // Apply basis element by element 223*1c21e869SJeremy L Thompson if (is_transpose) { 224*1c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 225*1c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 226*1c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 227*1c21e869SJeremy L Thompson CeedInt pre = 1; 228*1c21e869SJeremy L Thompson CeedInt post = 1; 229*1c21e869SJeremy L Thompson 230*1c21e869SJeremy L Thompson // Clear Chebyshev coeffs 231*1c21e869SJeremy L Thompson for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) { 232*1c21e869SJeremy L Thompson s_chebyshev_coeffs[k] = 0.0; 233*1c21e869SJeremy L Thompson } 234*1c21e869SJeremy L Thompson 235*1c21e869SJeremy L Thompson // Map from point 236*1c21e869SJeremy L Thompson for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) { 237*1c21e869SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 238*1c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; 239*1c21e869SJeremy L Thompson 240*1c21e869SJeremy L Thompson pre = 1; 241*1c21e869SJeremy L Thompson post = 1; 242*1c21e869SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 243*1c21e869SJeremy L Thompson // Update buffers used 244*1c21e869SJeremy L Thompson pre /= 1; 245*1c21e869SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1); 246*1c21e869SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2); 247*1c21e869SJeremy L Thompson 248*1c21e869SJeremy L Thompson // Build Chebyshev polynomial values 249*1c21e869SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 250*1c21e869SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x); 251*1c21e869SJeremy L Thompson 252*1c21e869SJeremy L Thompson // Contract along middle index 253*1c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 254*1c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 255*1c21e869SJeremy L Thompson if (dim_2 == BASIS_DIM - 1) { 256*1c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] += chebyshev_x[j] * in[a * post + c]; 257*1c21e869SJeremy L Thompson } else { 258*1c21e869SJeremy L Thompson for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c]; 259*1c21e869SJeremy L Thompson } 260*1c21e869SJeremy L Thompson } 261*1c21e869SJeremy L Thompson } 262*1c21e869SJeremy L Thompson post *= Q; 263*1c21e869SJeremy L Thompson } 264*1c21e869SJeremy L Thompson } 265*1c21e869SJeremy L Thompson } 266*1c21e869SJeremy L Thompson 267*1c21e869SJeremy L Thompson // Map from coefficients 268*1c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 269*1c21e869SJeremy L Thompson post = 1; 270*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 271*1c21e869SJeremy L Thompson __syncthreads(); 272*1c21e869SJeremy L Thompson // Update buffers used 273*1c21e869SJeremy L Thompson pre /= Q; 274*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1); 275*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2); 276*1c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * P; 277*1c21e869SJeremy L Thompson 278*1c21e869SJeremy L Thompson // Contract along middle index 279*1c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 280*1c21e869SJeremy L Thompson const CeedInt c = k % post; 281*1c21e869SJeremy L Thompson const CeedInt j = (k / post) % P; 282*1c21e869SJeremy L Thompson const CeedInt a = k / (post * P); 283*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 284*1c21e869SJeremy L Thompson 285*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c]; 286*1c21e869SJeremy L Thompson out[k] = v_k; 287*1c21e869SJeremy L Thompson } 288*1c21e869SJeremy L Thompson post *= P; 289*1c21e869SJeremy L Thompson } 290*1c21e869SJeremy L Thompson } 291*1c21e869SJeremy L Thompson } 292*1c21e869SJeremy L Thompson } else { 293*1c21e869SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) { 294*1c21e869SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 295*1c21e869SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 296*1c21e869SJeremy L Thompson CeedInt pre = u_size; 297*1c21e869SJeremy L Thompson CeedInt post = 1; 298*1c21e869SJeremy L Thompson 299*1c21e869SJeremy L Thompson // Map to coefficients 300*1c21e869SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 301*1c21e869SJeremy L Thompson __syncthreads(); 302*1c21e869SJeremy L Thompson // Update buffers used 303*1c21e869SJeremy L Thompson pre /= P; 304*1c21e869SJeremy L Thompson const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1); 305*1c21e869SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2); 306*1c21e869SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 307*1c21e869SJeremy L Thompson 308*1c21e869SJeremy L Thompson // Contract along middle index 309*1c21e869SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 310*1c21e869SJeremy L Thompson const CeedInt c = k % post; 311*1c21e869SJeremy L Thompson const CeedInt j = (k / post) % Q; 312*1c21e869SJeremy L Thompson const CeedInt a = k / (post * Q); 313*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 314*1c21e869SJeremy L Thompson 315*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c]; 316*1c21e869SJeremy L Thompson out[k] = v_k; 317*1c21e869SJeremy L Thompson } 318*1c21e869SJeremy L Thompson post *= Q; 319*1c21e869SJeremy L Thompson } 320*1c21e869SJeremy L Thompson 321*1c21e869SJeremy L Thompson // Map to point 322*1c21e869SJeremy L Thompson __syncthreads(); 323*1c21e869SJeremy L Thompson for (CeedInt p = blockIdx.x; p < BASIS_NUM_PTS; p += gridDim.x) { 324*1c21e869SJeremy L Thompson for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) { 325*1c21e869SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; 326*1c21e869SJeremy L Thompson 327*1c21e869SJeremy L Thompson pre = BASIS_NUM_QPTS; 328*1c21e869SJeremy L Thompson post = 1; 329*1c21e869SJeremy L Thompson for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { 330*1c21e869SJeremy L Thompson // Update buffers used 331*1c21e869SJeremy L Thompson pre /= Q; 332*1c21e869SJeremy L Thompson const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1); 333*1c21e869SJeremy L Thompson CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2); 334*1c21e869SJeremy L Thompson 335*1c21e869SJeremy L Thompson // Build Chebyshev polynomial values 336*1c21e869SJeremy L Thompson if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 337*1c21e869SJeremy L Thompson else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x); 338*1c21e869SJeremy L Thompson 339*1c21e869SJeremy L Thompson // Contract along middle index 340*1c21e869SJeremy L Thompson for (CeedInt a = 0; a < pre; a++) { 341*1c21e869SJeremy L Thompson for (CeedInt c = 0; c < post; c++) { 342*1c21e869SJeremy L Thompson CeedScalar v_k = 0; 343*1c21e869SJeremy L Thompson 344*1c21e869SJeremy L Thompson for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c]; 345*1c21e869SJeremy L Thompson out[a * post + c] = v_k; 346*1c21e869SJeremy L Thompson } 347*1c21e869SJeremy L Thompson } 348*1c21e869SJeremy L Thompson post *= 1; 349*1c21e869SJeremy L Thompson } 350*1c21e869SJeremy L Thompson } 351*1c21e869SJeremy L Thompson } 352*1c21e869SJeremy L Thompson } 353*1c21e869SJeremy L Thompson } 354*1c21e869SJeremy L Thompson } 355*1c21e869SJeremy L Thompson } 356