// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA tensor product basis with AtPoints evaluation
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D Chebyshev basis values at point x.
// Fills chebyshev_x[0..Q_1D-1] via the three-term recurrence
//   c[0] = 1, c[1] = 2x, c[i] = 2x*c[i-1] - c[i-2],
// which is the recurrence of the Chebyshev polynomials of the second kind U_i(x).
// Assumes Q_1D >= 2 (indices 0 and 1 are written unconditionally).
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate the derivatives of the same Chebyshev basis at point x.
// Fills chebyshev_dx[0..Q_1D-1] with c'[i] using
//   c'[0] = 0, c'[1] = 2, c'[i] = 2x*c'[i-1] + 2*c[i-1] - c'[i-2],
// obtained by differentiating the recurrence above.  The basis values c[i-1]
// needed by the recurrence are regenerated on the fly in a rolling 3-entry
// buffer: slot (i + 1) % 3 receives c[i], so slots 1 and 2 seed c[0] and c[1]
// and slot 0 is first written (at i = 2) before it is ever read.
// Assumes Q_1D >= 2.
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar chebyshev_x[3];

  chebyshev_x[1] = 1.0;
  chebyshev_x[2] = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Interpolate between tensor-product nodal values and arbitrary points.
//
// Launch layout (from the index arithmetic below): 1D thread blocks; each block
// grid-strides over elements via blockIdx.x/gridDim.x, and threads within a
// block cooperate on tensor contractions (strided by blockDim.x) and on points.
//
// Compile-time macros expected: BASIS_P_1D, BASIS_Q_1D, BASIS_DIM,
// BASIS_NUM_COMP, BASIS_NUM_NODES, BASIS_NUM_QPTS, BASIS_NUM_PTS,
// BASIS_BUF_LEN, POINTS_BUFF_LEN.
//
// is_transpose == false: nodal u -> Chebyshev coefficients (shared memory) ->
//   values at points, written to v (plain stores).
// is_transpose == true: point values u -> Chebyshev coefficients (scattered
//   with atomicAdd) -> nodal values, ACCUMULATED into v (`out[k] += v_k`), so
//   the caller's v must hold valid prior data (typically zeros).
//
// NOTE(review): atomicAdd on CeedScalar needs SM60+ when CeedScalar is double
// — presumably guaranteed by the build; confirm against the backend's
// minimum architecture.
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P interp matrix | contraction buffer 1 |
  // contraction buffer 2 | Chebyshev coefficient tensor].
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch for the per-point contractions (registers/local memory).
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Stage the 1D interpolation matrix into shared memory, cooperatively.
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  // u/v strides swap roles depending on direction of the transform.
  const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = 1;
        CeedInt post = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point: scatter each point's contribution into the shared
        // coefficient tensor, one tensor dimension at a time.
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Padding points beyond the element's actual count contribute nothing.
          if (p >= points_per_elem[elem]) continue;
          pre = 1;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used.  pre stays 1 in this per-point pass (each
            // point is a scalar source); post grows by Q per dimension.
            pre /= 1;
            const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (d == BASIS_DIM - 1) {
                  // Last dimension writes to shared memory; the (j + p) % Q
                  // offset staggers the update order across threads —
                  // presumably to reduce atomicAdd contention on the same
                  // addresses (the full set of j indices is still covered).
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }

        // Map from coefficients: contract the shared coefficient tensor with
        // the transposed interpolation matrix, one dimension at a time, using
        // the two shared ping-pong buffers.
        pre = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier also covers the atomicAdds from the point loop at d == 0.
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * P;

          // Contract along middle index; flat index k decomposes as (a, j, c).
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % P;
            const CeedInt a = k / (post * P);
            CeedScalar v_k = 0;

            // chebyshev_interp_1d is indexed transposed here (j + b * P).
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Transpose mode accumulates into the caller's output.
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
        // NOTE(review): for BASIS_DIM == 1 the next comp iteration clears
        // s_chebyshev_coeffs with no barrier after the d == 0 reads above —
        // looks benign for DIM >= 2 (last reads hit s_buffer_*); confirm the
        // DIM == 1 case.
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = u_size;
        CeedInt post = 1;

        // Map to coefficients: contract nodal values with the interpolation
        // matrix along each dimension into the shared coefficient tensor.
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * Q;

          // Contract along middle index; flat index k decomposes as (a, j, c).
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % Q;
            const CeedInt a = k / (post * Q);
            CeedScalar v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates whole points independently by
        // contracting the coefficient tensor with the Chebyshev values at the
        // point's coordinate in each dimension.
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          pre = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used.  pre shrinks by Q per dimension; post stays
            // 1 (each contraction collapses a dimension to the single point).
            pre /= Q;
            const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;
          }
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Gradient between tensor-product nodal values and arbitrary points.
//
// Same launch layout, shared-memory layout, and direction semantics as
// InterpAtPoints above.  The extra dim_1 loop selects which tensor dimension
// receives the Chebyshev DERIVATIVE (ChebyshevDerivativeAtPoint) while the
// remaining dimensions use plain values, producing/consuming one gradient
// component per dim_1.  u_dim_stride/v_dim_stride address those components:
// nonzero on the side holding point data, zero on the nodal side.
//
// Transpose mode: contributions from all dim_1 components are atomically
// summed into the single shared coefficient tensor, then mapped to nodes and
// ACCUMULATED into v (`out[k] += v_k`) — v must hold valid prior data.
//
// NOTE(review): atomicAdd on CeedScalar needs SM60+ when CeedScalar is double
// — confirm against the backend's minimum architecture.
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [Q x P interp matrix | contraction buffer 1 |
  // contraction buffer 2 | Chebyshev coefficient tensor].
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch for the per-point contractions.
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Stage the 1D interpolation matrix into shared memory, cooperatively.
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  // u/v strides swap roles depending on direction of the transform.
  const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  // Gradient-component stride: only the point side of the transform carries
  // one slab per dimension.
  const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
  const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  if (is_transpose) {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = 1;
        CeedInt post = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point: for each gradient component dim_1, scatter the
        // point values into the shared coefficient tensor; all components
        // sum into the same tensor via atomicAdd.
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Padding points beyond the element's actual count contribute nothing.
          if (p >= points_per_elem[elem]) continue;
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

            pre = 1;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used.  pre stays 1 in this per-point pass;
              // post grows by Q per dimension.
              pre /= 1;
              const CeedScalar *in = dim_2 == 0 ? (&cur_u[p]) : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: the derivative along the
              // gradient dimension, plain values along the others.
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  if (dim_2 == BASIS_DIM - 1) {
                    // (j + p) % Q staggers atomic updates across threads —
                    // presumably to reduce contention; all j are still covered.
                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                  } else {
                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                  }
                }
              }
              post *= Q;
            }
          }
        }

        // Map from coefficients: contract with the transposed interpolation
        // matrix dimension by dimension, ping-ponging the shared buffers.
        pre = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier also covers the atomicAdds from the point loop at d == 0.
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * P;

          // Contract along middle index; flat index k decomposes as (a, j, c).
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % P;
            const CeedInt a = k / (post * P);
            CeedScalar v_k = 0;

            // chebyshev_interp_1d is indexed transposed here (j + b * P).
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Transpose mode accumulates into the caller's output.
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedInt pre = u_size;
        CeedInt post = 1;

        // Map to coefficients: contract nodal values with the interpolation
        // matrix along each dimension into the shared coefficient tensor.
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * Q;

          // Contract along middle index; flat index k decomposes as (a, j, c).
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % Q;
            const CeedInt a = k / (post * Q);
            CeedScalar v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates whole points independently; for
        // each gradient component dim_1, the derivative is applied along
        // dimension dim_1 and plain values along the others.
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

            pre = BASIS_NUM_QPTS;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used.  pre shrinks by Q per dimension; post
              // stays 1 (each contraction collapses to the single point).
              pre /= Q;
              const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (&cur_v[p]) : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: derivative along dim_1,
              // plain values along the other dimensions.
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  CeedScalar v_k = 0;

                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                  out[a * post + c] = v_k;
                }
              }
              post *= 1;
            }
          }
        }
      }
    }
  }
}