// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA tensor product basis with AtPoints evaluation
///
/// Compile-time defines expected (injected at JIT time):
///   BASIS_P_1D, BASIS_Q_1D, BASIS_DIM, BASIS_NUM_COMP,
///   BASIS_NUM_NODES, BASIS_NUM_QPTS, BASIS_NUM_PTS,
///   BASIS_BUF_LEN, POINTS_BUFF_LEN
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D polynomials of the three-term recurrence
//   p_0 = 1, p_1 = 2 x, p_i = 2 x p_{i-1} - p_{i-2}
// at the point x, writing p_i into chebyshev_x[i].
// Precondition: Q_1D >= 2 (indices 0 and 1 are written unconditionally).
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate the derivatives p_i'(x) of the same Q_1D polynomials,
// writing p_i' into chebyshev_dx[i].
// Uses p_i' = 2 x p_{i-1}' + 2 p_{i-1} - p_{i-2}' (differentiated recurrence);
// a rolling 3-entry buffer holds the last three polynomial values so the full
// polynomial array is never materialized.  At iteration i, chebyshev_x[(i+0)%3]
// holds p_{i-1} and chebyshev_x[(i+2)%3] holds p_{i-2}; the slots start at
// indices 1 and 2 so this alignment holds for i == 2.
// Precondition: Q_1D >= 2.
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar chebyshev_x[3];

  chebyshev_x[1]  = 1.0;
  chebyshev_x[2]  = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Interpolate nodal values to arbitrary points.
// Launch: 1-D grid (elements strided by gridDim.x), 1-D block (threads share
// one element).  Two stages per element/component:
//   1) contract nodal values with chebyshev_interp_1d (P_1D -> Q_1D per
//      dimension, block-cooperative, shared-memory ping-pong buffers) to get
//      Chebyshev coefficients;
//   2) each thread evaluates the coefficient expansion at its points'
//      coordinates using private buffers.
// Note: points_per_elem is not consulted here — every one of the
// BASIS_NUM_PTS point slots is evaluated, including any padding slots.
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp matrix | buffer 1 | buffer 2 | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  CeedScalar  chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];

  // Cooperatively stage the 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_NODES;
  const CeedInt v_stride      = BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size        = BASIS_NUM_NODES;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt           pre   = u_size;
      CeedInt           post  = 1;

      // Map to coefficients: one tensor contraction per dimension
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();  // writes to the previous ping-pong buffer must be visible before reading
        // Update buffers used
        pre /= P;
        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % Q;
          const CeedInt a   = k / (post * Q);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: each thread evaluates the expansion at its own points
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        pre  = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used
          pre /= Q;
          const CeedScalar *in  = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar       *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

          // Contract along middle index
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              CeedScalar v_k = 0;

              for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
              out[a * post + c] = v_k;
            }
          }
          post *= 1;  // contracted dimension collapses to length 1 at a point
        }
      }
    }
  }
}

// Transpose of InterpAtPoints: scatter point values back to nodes.
// Launch: same as InterpAtPoints.  Two stages per element/component:
//   1) each thread scatters its points' values into the shared Chebyshev
//      coefficient array (atomicAdd, since many points hit the same
//      coefficients);
//   2) block-cooperative contraction with the transposed interp matrix,
//      accumulating (+=) into v at the final dimension.
// Padding slots (p >= points_per_elem[elem]) are skipped so they contribute
// nothing.
extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                   const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                   const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp matrix | buffer 1 | buffer 2 | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  CeedScalar  chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];

  // Cooperatively stage the 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_PTS;
  const CeedInt v_stride      = BASIS_NUM_NODES;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt u_size        = BASIS_NUM_PTS;  // declared for symmetry; not used below

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedScalar       *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt           pre   = 1;
      CeedInt           post  = 1;

      // Clear Chebyshev coeffs
      // NOTE(review): barrier added — without it the clear can race with
      // threads still reading s_chebyshev_coeffs for the previous component
      // (the BASIS_DIM == 1 path has no later in-loop barrier after the read)
      __syncthreads();
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: scatter each point value into the coefficient array
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        if (p >= points_per_elem[elem]) continue;  // skip padding slots
        pre  = 1;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Update buffers used
          pre /= 1;
          const CeedScalar *in  = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
          CeedScalar       *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

          // Build Chebyshev polynomial values
          ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

          // Contract along middle index; the final dimension accumulates into
          // shared memory via atomicAdd.  The index offset by p permutes each
          // thread's update order, which likely reduces atomic contention —
          // the same permutation is applied to chebyshev_x, so the result is
          // unchanged.
          for (CeedInt a = 0; a < pre; a++) {
            for (CeedInt c = 0; c < post; c++) {
              if (d == BASIS_DIM - 1) {
                for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
              } else {
                for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
              }
            }
          }
          post *= Q;
        }
      }

      // Map from coefficients: transpose contraction back to nodes
      pre  = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();  // coefficient/buffer writes must be visible before reading
        // Update buffers used
        pre /= Q;
        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * P;

        // Contract along middle index (transposed interp matrix access)
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % P;
          const CeedInt a   = k / (post * P);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          if (d == BASIS_DIM - 1) out[k] += v_k;  // transpose accumulates into v
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Evaluate gradients of nodal values at arbitrary points.
// Same structure as InterpAtPoints, but stage 2 is repeated once per
// derivative direction dim_1, substituting ChebyshevDerivativeAtPoint for the
// contraction dimension that matches dim_1.
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp matrix | buffer 1 | buffer 2 | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  CeedScalar  chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];

  // Cooperatively stage the 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_NODES;
  const CeedInt v_stride      = BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt u_size        = BASIS_NUM_NODES;
  const CeedInt u_dim_stride  = 0;  // declared for symmetry with GradTransposeAtPoints; not used below
  const CeedInt v_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
      CeedInt           pre   = u_size;
      CeedInt           post  = 1;

      // Map to coefficients: one tensor contraction per dimension
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();  // writes to the previous ping-pong buffer must be visible before reading
        // Update buffers used
        pre /= P;
        const CeedScalar *in       = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * Q;

        // Contract along middle index
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % Q;
          const CeedInt a   = k / (post * Q);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
          out[k] = v_k;
        }
        post *= Q;
      }

      // Map to point: evaluate each directional derivative at each point
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

          pre  = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in  = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values — derivative along the direction being differentiated
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;  // contracted dimension collapses to length 1 at a point
          }
        }
      }
    }
  }
}

// Transpose of GradAtPoints: scatter directional point values back to nodes.
// Same structure as InterpTransposeAtPoints, with the point-scatter stage
// repeated once per derivative direction dim_1, substituting
// ChebyshevDerivativeAtPoint along the matching contraction dimension.
extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                                 const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                                 const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared memory layout: [interp matrix | buffer 1 | buffer 2 | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1            = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2            = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs    = s_buffer_2 + BASIS_BUF_LEN;
  CeedScalar  chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];

  // Cooperatively stage the 1D interpolation matrix in shared memory
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P             = BASIS_P_1D;
  const CeedInt Q             = BASIS_Q_1D;
  const CeedInt u_stride      = BASIS_NUM_PTS;
  const CeedInt v_stride      = BASIS_NUM_NODES;
  const CeedInt u_comp_stride = num_elem * BASIS_NUM_PTS;
  const CeedInt v_comp_stride = num_elem * BASIS_NUM_NODES;
  const CeedInt u_size        = BASIS_NUM_PTS;  // declared for symmetry; not used below
  const CeedInt u_dim_stride  = num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;
  const CeedInt v_dim_stride  = 0;  // declared for symmetry with GradAtPoints; not used below

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
      CeedInt     pre   = 1;
      CeedInt     post  = 1;

      // Clear Chebyshev coeffs
      // NOTE(review): barrier added — without it the clear can race with
      // threads still reading s_chebyshev_coeffs for the previous component
      // (the BASIS_DIM == 1 path has no later in-loop barrier after the read)
      __syncthreads();
      for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
        s_chebyshev_coeffs[k] = 0.0;
      }

      // Map from point: scatter each directional point value into the coefficient array
      __syncthreads();
      for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
        if (p >= points_per_elem[elem]) continue;  // skip padding slots
        for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
          const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

          pre  = 1;
          post = 1;
          for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
            // Update buffers used
            pre /= 1;
            const CeedScalar *in  = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
            CeedScalar       *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values — derivative along the direction being differentiated
            if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
            else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

            // Contract along middle index; the final dimension accumulates into
            // shared memory via atomicAdd.  The index offset by p permutes each
            // thread's update order, which likely reduces atomic contention —
            // the same permutation is applied to chebyshev_x, so the result is
            // unchanged.
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (dim_2 == BASIS_DIM - 1) {
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }
      }

      // Map from coefficients: transpose contraction back to nodes
      pre  = BASIS_NUM_QPTS;
      post = 1;
      for (CeedInt d = 0; d < BASIS_DIM; d++) {
        __syncthreads();  // coefficient/buffer writes must be visible before reading
        // Update buffers used
        pre /= Q;
        const CeedScalar *in       = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
        CeedScalar       *out      = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
        const CeedInt     writeLen = pre * post * P;

        // Contract along middle index (transposed interp matrix access)
        for (CeedInt k = i; k < writeLen; k += blockDim.x) {
          const CeedInt c   = k % post;
          const CeedInt j   = (k / post) % P;
          const CeedInt a   = k / (post * P);
          CeedScalar    v_k = 0;

          for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
          if (d == BASIS_DIM - 1) out[k] += v_k;  // transpose accumulates into v
          else out[k] = v_k;
        }
        post *= P;
      }
    }
  }
}