// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA tensor product basis with AtPoints evaluation
///
/// NOTE(review): this header references BASIS_Q_1D, BASIS_P_1D, BASIS_DIM, BASIS_NUM_COMP,
/// BASIS_NUM_NODES, BASIS_NUM_PTS, BASIS_NUM_QPTS, BASIS_BUF_LEN, and POINTS_BUFF_LEN without
/// defining them — they are presumably injected at JIT compile time by the caller; confirm
/// against the backend that compiles this file.

#include <ceed.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D Chebyshev polynomials at the single point x via the
// three-term recurrence c[0] = 1, c[1] = 2x, c[i] = 2x*c[i-1] - c[i-2]
// (the second-kind recurrence, given the 2x seed).
// chebyshev_x: output array, must hold at least Q_1D entries.
// Assumes Q_1D >= 2 (chebyshev_x[1] is written unconditionally).
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate the derivatives of the first Q_1D Chebyshev polynomials (as defined by
// ChebyshevPolynomialsAtPoint above) at the single point x.
// The derivative recurrence dx[i] = 2x*dx[i-1] + 2*c[i-1] - dx[i-2] is the exact
// derivative of the value recurrence c[i] = 2x*c[i-1] - c[i-2].
// chebyshev_dx: output array, must hold at least Q_1D entries.
// Assumes Q_1D >= 2 (chebyshev_dx[1] is written unconditionally).
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  // Rolling 3-entry window of polynomial values: only c[i-1] and c[i-2] are needed,
  // so slot (j + 1) % 3 holds value c[j]. Initialized with c[0] = 1 and c[1] = 2x.
  CeedScalar chebyshev_x[3];

  chebyshev_x[1] = 1.0;
  chebyshev_x[2] = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    // (i + 0) % 3 holds c[i-1], (i + 2) % 3 holds c[i-2]; the new c[i] overwrites c[i-3]'s slot
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// Tensor Basis Kernels AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//------------------------------------------------------------------------------
// Interpolate between tensor-product basis nodes and arbitrary points, using a
// Chebyshev expansion at quadrature resolution as the intermediate representation.
//
// Launch layout: 1D grid / 1D block. Each block grid-strides over elements
// (elem = blockIdx.x, += gridDim.x); threads within the block cooperate on the
// tensor contractions and stride over points (p = threadIdx.x, += blockDim.x).
//
// num_elem:            number of elements
// is_transpose:        if nonzero, apply the transpose map (points -> nodes);
//                      in transpose mode results are ACCUMULATED into v (+=)
// chebyshev_interp_1d: 1D nodes-to-Chebyshev-coefficient matrix, BASIS_Q_1D x BASIS_P_1D
// points_per_elem:     number of valid points in each element; in transpose mode
//                      points past this count are skipped so they contribute nothing
// coords:              point coordinates, laid out per dimension with the same
//                      elem/point strides as the point-side data
// u, v:                input/output; which side holds nodes vs. points depends on is_transpose
//
// Shared memory: a single static s_mem arena partitioned into the interp matrix,
// two contraction ping-pong buffers, and the Chebyshev coefficient block. Assumes
// POINTS_BUFF_LEN * BASIS_Q_1D >= BASIS_NUM_QPTS (the coefficient region is indexed
// up to BASIS_NUM_QPTS) — presumably POINTS_BUFF_LEN == BASIS_Q_1D^(BASIS_DIM-1);
// TODO confirm against the macro definitions.
extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                          const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                          const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared arena: [interp matrix | ping buffer | pong buffer | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: 1D basis values at one point, plus local contraction ping-pong
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load of the interp matrix; no barrier here — the first read of
  // s_chebyshev_interp_1d in either branch below is behind a later __syncthreads()
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  // Strides flip between node-side and point-side layouts depending on direction
  const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;

  // Apply basis element by element
  if (is_transpose) {
    // Transpose: scatter point values into Chebyshev coefficients, then contract
    // coefficients down to nodes and accumulate into v
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = 1;
        CeedInt post = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point: each thread owns whole points; per-point tensor build-up
        // happens in local buffers, only the final dimension scatters to shared
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Skip padding points beyond this element's actual point count
          if (p >= points_per_elem[elem]) continue;
          pre = 1;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used
            pre /= 1;  // no-op; mirrors the standard contraction template below
            const CeedScalar *in = d == 0 ? (&cur_u[p]) : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + d * u_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                if (d == BASIS_DIM - 1) {
                  // Final dimension scatters into shared coefficients with atomics;
                  // the j index is rotated by p ((j + p) % Q permutes j, so the same
                  // value/slot pairs are produced) — presumably to stagger concurrent
                  // threads across addresses and reduce atomic contention
                  for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                } else {
                  for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                }
              }
            }
            post *= Q;
          }
        }

        // Map from coefficients: Q^DIM Chebyshev coefficients -> P^DIM nodes,
        // one tensor contraction per dimension, block-cooperative
        pre = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier before every pass: protects the atomics above on the first pass
          // and the shared ping-pong buffer handoff on later passes
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * P;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % P;
            const CeedInt a = k / (post * P);
            CeedScalar v_k = 0;

            // Transposed matrix access: interp_1d[j + b * P] is (interp_1d)^T
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Transpose application accumulates into the output vector
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    // Forward: contract nodes up to Chebyshev coefficients, then evaluate the
    // expansion at each point
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = u_size;
        CeedInt post = 1;

        // Map to coefficients: P^DIM nodes -> Q^DIM Chebyshev coefficients
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier guards the shared ping-pong handoff (and, on the first element
          // iteration, the cooperative interp-matrix load)
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % Q;
            const CeedInt a = k / (post * Q);
            CeedScalar v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates whole points from the shared
        // coefficients, working entirely in local buffers
        // NOTE(review): all BASIS_NUM_PTS entries are evaluated here, including any
        // past points_per_elem[elem] — presumably padding outputs are ignored by the caller
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          pre = BASIS_NUM_QPTS;
          post = 1;
          for (CeedInt d = 0; d < BASIS_DIM; d++) {
            // Update buffers used
            pre /= Q;
            const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? buffer_2 : buffer_1);
            CeedScalar *out = d == BASIS_DIM - 1 ? (&cur_v[p]) : (d % 2 ? buffer_1 : buffer_2);

            // Build Chebyshev polynomial values
            ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + d * v_comp_stride + p], chebyshev_x);

            // Contract along middle index
            for (CeedInt a = 0; a < pre; a++) {
              for (CeedInt c = 0; c < post; c++) {
                CeedScalar v_k = 0;

                for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                out[a * post + c] = v_k;
              }
            }
            post *= 1;  // no-op; each pass here shrinks the data by a factor of Q
          }
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//------------------------------------------------------------------------------
// Evaluate gradients between tensor-product basis nodes and arbitrary points.
// Identical structure to InterpAtPoints (same launch layout, shared-memory
// partitioning, and contraction template), except that for each derivative
// direction dim_1 the point-side pass uses ChebyshevDerivativeAtPoint along
// dimension dim_1 and ChebyshevPolynomialsAtPoint along the others.
//
// The point-side data carries an extra dimension index: u (transpose) or
// v (forward) is strided by num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP per
// derivative direction; the node-side data has no dimension stride (0).
// In transpose mode results are ACCUMULATED into v (+=).
extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedInt is_transpose, const CeedScalar *__restrict__ chebyshev_interp_1d,
                                        const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ coords,
                                        const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
  const CeedInt i = threadIdx.x;

  // Shared arena: [interp matrix | ping buffer | pong buffer | Chebyshev coefficients]
  __shared__ CeedScalar s_mem[BASIS_Q_1D * BASIS_P_1D + 2 * BASIS_BUF_LEN + POINTS_BUFF_LEN * BASIS_Q_1D];
  CeedScalar *s_chebyshev_interp_1d = s_mem;
  CeedScalar *s_buffer_1 = s_mem + BASIS_Q_1D * BASIS_P_1D;
  CeedScalar *s_buffer_2 = s_buffer_1 + BASIS_BUF_LEN;
  CeedScalar *s_chebyshev_coeffs = s_buffer_2 + BASIS_BUF_LEN;
  // Per-thread scratch: 1D basis (or derivative) values at one point, plus local ping-pong
  CeedScalar chebyshev_x[BASIS_Q_1D], buffer_1[POINTS_BUFF_LEN], buffer_2[POINTS_BUFF_LEN];
  // Cooperative load; first read below is behind a later __syncthreads()
  for (CeedInt k = i; k < BASIS_Q_1D * BASIS_P_1D; k += blockDim.x) {
    s_chebyshev_interp_1d[k] = chebyshev_interp_1d[k];
  }

  const CeedInt P = BASIS_P_1D;
  const CeedInt Q = BASIS_Q_1D;
  const CeedInt u_stride = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  const CeedInt v_stride = is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS;
  const CeedInt u_comp_stride = num_elem * (is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES);
  const CeedInt v_comp_stride = num_elem * (is_transpose ? BASIS_NUM_NODES : BASIS_NUM_PTS);
  const CeedInt u_size = is_transpose ? BASIS_NUM_PTS : BASIS_NUM_NODES;
  // Only the point side carries a derivative-direction stride; the node side's is 0
  const CeedInt u_dim_stride = is_transpose ? num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP : 0;
  const CeedInt v_dim_stride = is_transpose ? 0 : num_elem * BASIS_NUM_PTS * BASIS_NUM_COMP;

  // Apply basis element by element
  if (is_transpose) {
    // Transpose: scatter all DIM derivative components at each point into one set of
    // Chebyshev coefficients, then contract coefficients down to nodes and accumulate
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        CeedScalar *cur_v = &v[elem * v_stride + comp * v_comp_stride];
        CeedInt pre = 1;
        CeedInt post = 1;

        // Clear Chebyshev coeffs
        for (CeedInt k = i; k < BASIS_NUM_QPTS; k += blockDim.x) {
          s_chebyshev_coeffs[k] = 0.0;
        }

        // Map from point
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          // Skip padding points beyond this element's actual point count
          if (p >= points_per_elem[elem]) continue;
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            // dim_1 selects the derivative direction (and the matching slice of u)
            const CeedScalar *cur_u = &u[elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride];

            pre = 1;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used
              pre /= 1;  // no-op; mirrors the standard contraction template
              const CeedScalar *in = dim_2 == 0 ? (cur_u + p) : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar *out = dim_2 == BASIS_DIM - 1 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: derivative along dim_1, values elsewhere
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * u_stride + dim_2 * u_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  if (dim_2 == BASIS_DIM - 1) {
                    // Atomic scatter; j rotated by p ((j + p) % Q is a permutation of j),
                    // presumably to stagger threads and reduce atomic contention
                    for (CeedInt j = 0; j < Q; j++) atomicAdd(&out[(a * Q + (j + p) % Q) * post + c], chebyshev_x[(j + p) % Q] * in[a * post + c]);
                  } else {
                    for (CeedInt j = 0; j < Q; j++) out[(a * Q + j) * post + c] = chebyshev_x[j] * in[a * post + c];
                  }
                }
              }
              post *= Q;
            }
          }
        }

        // Map from coefficients: Q^DIM coefficients -> P^DIM nodes, block-cooperative
        pre = BASIS_NUM_QPTS;
        post = 1;
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier protects the atomics above (first pass) and the shared
          // ping-pong handoff (later passes)
          __syncthreads();
          // Update buffers used
          pre /= Q;
          const CeedScalar *in = d == 0 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * P;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % P;
            const CeedInt a = k / (post * P);
            CeedScalar v_k = 0;

            // Transposed matrix access: interp_1d[j + b * P] is (interp_1d)^T
            for (CeedInt b = 0; b < Q; b++) v_k += s_chebyshev_interp_1d[j + b * BASIS_P_1D] * in[(a * Q + b) * post + c];
            // Transpose application accumulates into the output vector
            if (d == BASIS_DIM - 1) out[k] += v_k;
            else out[k] = v_k;
          }
          post *= P;
        }
      }
    }
  } else {
    // Forward: contract nodes up to Chebyshev coefficients once, then evaluate
    // each derivative direction at each point from the same coefficients
    for (CeedInt elem = blockIdx.x; elem < num_elem; elem += gridDim.x) {
      for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
        const CeedScalar *cur_u = &u[elem * u_stride + comp * u_comp_stride];
        CeedInt pre = u_size;
        CeedInt post = 1;

        // Map to coefficients: P^DIM nodes -> Q^DIM Chebyshev coefficients
        for (CeedInt d = 0; d < BASIS_DIM; d++) {
          // Barrier guards the shared ping-pong handoff (and, first time through,
          // the cooperative interp-matrix load)
          __syncthreads();
          // Update buffers used
          pre /= P;
          const CeedScalar *in = d == 0 ? cur_u : (d % 2 ? s_buffer_2 : s_buffer_1);
          CeedScalar *out = d == BASIS_DIM - 1 ? s_chebyshev_coeffs : (d % 2 ? s_buffer_1 : s_buffer_2);
          const CeedInt writeLen = pre * post * Q;

          // Contract along middle index
          for (CeedInt k = i; k < writeLen; k += blockDim.x) {
            const CeedInt c = k % post;
            const CeedInt j = (k / post) % Q;
            const CeedInt a = k / (post * Q);
            CeedScalar v_k = 0;

            for (CeedInt b = 0; b < P; b++) v_k += s_chebyshev_interp_1d[j * BASIS_P_1D + b] * in[(a * P + b) * post + c];
            out[k] = v_k;
          }
          post *= Q;
        }

        // Map to point: each thread evaluates all DIM derivative directions for
        // whole points from the shared coefficients
        // NOTE(review): all BASIS_NUM_PTS entries are evaluated, including any past
        // points_per_elem[elem] — presumably padding outputs are ignored by the caller
        __syncthreads();
        for (CeedInt p = threadIdx.x; p < BASIS_NUM_PTS; p += blockDim.x) {
          for (CeedInt dim_1 = 0; dim_1 < BASIS_DIM; dim_1++) {
            // dim_1 selects the derivative direction (and the matching slice of v)
            CeedScalar *cur_v = &v[elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride];

            pre = BASIS_NUM_QPTS;
            post = 1;
            for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) {
              // Update buffers used
              pre /= Q;
              const CeedScalar *in = dim_2 == 0 ? s_chebyshev_coeffs : (dim_2 % 2 ? buffer_2 : buffer_1);
              CeedScalar *out = dim_2 == BASIS_DIM - 1 ? (cur_v + p) : (dim_2 % 2 ? buffer_1 : buffer_2);

              // Build Chebyshev polynomial values: derivative along dim_1, values elsewhere
              if (dim_1 == dim_2) ChebyshevDerivativeAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);
              else ChebyshevPolynomialsAtPoint<BASIS_Q_1D>(coords[elem * v_stride + dim_2 * v_comp_stride + p], chebyshev_x);

              // Contract along middle index
              for (CeedInt a = 0; a < pre; a++) {
                for (CeedInt c = 0; c < post; c++) {
                  CeedScalar v_k = 0;

                  for (CeedInt b = 0; b < Q; b++) v_k += chebyshev_x[b] * in[(a * Q + b) * post + c];
                  out[a * post + c] = v_k;
                }
              }
              post *= 1;  // no-op; each pass here shrinks the data by a factor of Q
            }
          }
        }
      }
    }
  }
}