// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis AtPoints templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the Q_1D Chebyshev basis values at point coordinate x.
// Note: the recurrence is seeded with chebyshev_x[1] = 2 * x; callers rely only on
// consistency between this routine and ChebyshevDerivativeAtPoint below.
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  // Three-term recurrence: p_i = 2 x p_{i-1} - p_{i-2}
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate the derivatives of the Q_1D Chebyshev basis values at point coordinate x.
// A rotating 3-entry buffer carries the polynomial recurrence needed by the derivative
// recurrence d_i = 2 x d_{i-1} + 2 p_{i-1} - d_{i-2}.
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar chebyshev_x[3];

  // chebyshev_x[(i + 0) % 3] holds p_{i-1} when the loop body runs for index i
  chebyshev_x[1]  = 1.0;
  chebyshev_x[2]  = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D interpolate to points
//------------------------------------------------------------------------------
// Interpolate the Chebyshev coefficients r_C (one per component, distributed across
// threads) to the point with coordinate r_X[0] handled by this thread, writing one
// value per component into r_V.
// data.slice must hold at least Q_1D scalars; all threads of the block must call this
// together (uniform __syncthreads()).
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Load coefficients
    // Barrier before overwriting the slice: other threads may still be reading the
    // previous component's coefficients below (same pattern as GradAtPoints1d)
    __syncthreads();
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints1d: scatter this thread's point values r_U back onto the
// Chebyshev coefficients r_C. Threads with an active point (p < NUM_POINTS) accumulate
// into data.slice via atomicAdd, then every thread pulls its own coefficient.
// data.slice must hold at least Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting to reduce atomic add contention.
        // Only slice[0, Q_1D) is cleared and read back, so no component offset is
        // applied here (a comp * Q_1D offset would scatter into uncleared, unread memory)
        const CeedInt ii = (i + data.t_id_x) % Q_1D;

        atomicAdd(&data.slice[ii], chebyshev_x[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the derivative of the Chebyshev expansion r_C at this thread's point
// coordinate r_X[0], writing one value per component into r_V.
// data.slice must hold at least Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Load coefficients
    // Leading barrier protects the previous iteration's readers of data.slice
    __syncthreads();
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints1d: scatter this thread's point derivative values r_U back
// onto the Chebyshev coefficients r_C via atomicAdd into data.slice.
// data.slice must hold at least Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting to reduce atomic add contention.
        // As in InterpTransposeAtPoints1d, no component offset: only slice[0, Q_1D)
        // is cleared and read back
        const CeedInt ii = (i + data.t_id_x) % Q_1D;

        atomicAdd(&data.slice[ii], chebyshev_x[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D interpolate to points
//------------------------------------------------------------------------------
// Interpolate Chebyshev coefficients to this thread's point (r_X[0], r_X[1]) by tensor
// contraction, x direction first then y.
// data.slice must hold at least Q_1D * Q_1D scalars (x fastest); uniform __syncthreads().
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Load coefficients
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = 0.0;
      for (CeedInt j = 0; j < Q_1D; j++) {
        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
      }
    }
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * buffer[i];
    }
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints2d: scatter this thread's point value r_U back onto the
// 2D Chebyshev coefficient tile via atomicAdd into data.slice.
// data.slice must hold at least Q_1D * Q_1D scalars; uniform __syncthreads().
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Clear shared memory
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = chebyshev_x[i] * r_U[comp];
    }
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting to avoid atomic adds
        const CeedInt ii = (i + data.t_id_x) % Q_1D;

        for (CeedInt j = 0; j < Q_1D; j++) {
          const CeedInt jj = (j + data.t_id_y) % Q_1D;

          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
        }
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at points
//------------------------------------------------------------------------------
// Evaluate both partial derivatives (d/dx then d/dy) of the Chebyshev expansion at this
// thread's point, writing NUM_COMP values per dimension into r_V (component fastest).
// data.slice must hold at least Q_1D * Q_1D scalars; uniform __syncthreads().
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Load coefficients
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract x direction (derivative values when dim == 0)
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y direction (derivative values when dim == 1)
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
      }
    }
  }
}
//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints2d: scatter this thread's point derivative values r_U
// (NUM_COMP values per dimension, component fastest) back onto the 2D Chebyshev
// coefficient tile, accumulating into r_C.
// data.slice must hold at least Q_1D * Q_1D scalars; all threads must call together.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  const bool in_tile = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar cheb_1d[Q_1D];
    CeedScalar partial[Q_1D];

    // Zero the Q_1D x Q_1D accumulation tile in shared memory
    if (in_tile) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Apply the y-direction basis (derivative values when dim == 1)
      if (dim == 1) {
        ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], cheb_1d);
      } else {
        ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], cheb_1d);
      }
      for (CeedInt q = 0; q < Q_1D; q++) {
        partial[q] = cheb_1d[q] * r_U[comp + dim * NUM_COMP];
      }
      // Apply the x-direction basis (derivative values when dim == 0)
      if (dim == 0) {
        ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], cheb_1d);
      } else {
        ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], cheb_1d);
      }
      // Threads with an active point scatter into the shared tile
      if (p < NUM_POINTS) {
        for (CeedInt a = 0; a < Q_1D; a++) {
          // Note: shifted indexing decorrelates threads to reduce atomic contention
          const CeedInt x_ind = (a + data.t_id_x) % Q_1D;

          for (CeedInt b = 0; b < Q_1D; b++) {
            const CeedInt y_ind = (b + data.t_id_y) % Q_1D;

            atomicAdd(&data.slice[y_ind + x_ind * Q_1D], cheb_1d[y_ind] * partial[x_ind]);
          }
        }
      }
    }
    // Copy the accumulated coefficients back to this thread's register
    __syncthreads();
    if (in_tile) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D interpolate to points
//------------------------------------------------------------------------------
// Interpolate Chebyshev coefficients r_C (Q_1D values per component, one per z slab k)
// to this thread's point (r_X[0], r_X[1], r_X[2]), writing one value per component
// into r_V. Contracts x via the shared tile, then y and z in registers.
// data.slice must hold at least Q_1D * Q_1D scalars (x fastest); all threads of the
// block must call together (uniform __syncthreads()).
// NOTE(review): assumes each thread handles at most one point p — confirm against caller.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Load coefficients for z slab k
      // Leading barrier protects the previous iteration's readers of data.slice
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y and z direction
      // z contribution of this slab is the single basis value at index k
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints3d: scatter this thread's point value r_U back onto the
// Chebyshev coefficients r_C (Q_1D values per component, one per z slab k).
// Threads with an active point (p < NUM_POINTS) accumulate into the shared tile via
// atomicAdd; every in-tile thread then pulls its own coefficient.
// data.slice must hold at least Q_1D * Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                 CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      // Contract y and z direction
      // z contribution of this slab is the single basis value at index k
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
      }
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      if (p < NUM_POINTS) {
        for (CeedInt i = 0; i < Q_1D; i++) {
          // Note: shifting to avoid atomic adds
          const CeedInt ii = (i + data.t_id_x) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt jj = (j + data.t_id_y) % Q_1D;

            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at points
//------------------------------------------------------------------------------
// Evaluate all three partial derivatives of the Chebyshev expansion at this thread's
// point, writing NUM_COMP values per dimension into r_V (component fastest).
// For each dim, the derivative basis is used in exactly one direction and the plain
// polynomial basis in the other two.
// data.slice must hold at least Q_1D * Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Load coefficients for z slab k
      // Leading barrier protects the previous iteration's readers of data.slice
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract x direction (derivative values when dim == 0)
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = 0.0;
          for (CeedInt j = 0; j < Q_1D; j++) {
            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
          }
        }
        // Contract y and z direction (derivative values when dim == 2 or dim == 1)
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints3d: scatter this thread's point derivative values r_U
// (NUM_COMP values per dimension, component fastest) back onto the Chebyshev
// coefficients r_C (Q_1D values per component, one per z slab k).
// All three dim contributions for a slab accumulate into the same shared tile before
// the single read-back; atomicAdd makes the concurrent scatters safe.
// data.slice must hold at least Q_1D * Q_1D scalars; uniform __syncthreads() required.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                               CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract y and z direction (derivative values when dim == 2 or dim == 1)
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
        }
        // Contract x direction (derivative values when dim == 0)
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        if (p < NUM_POINTS) {
          for (CeedInt i = 0; i < Q_1D; i++) {
            // Note: shifting to avoid atomic adds
            const CeedInt ii = (i + data.t_id_x) % Q_1D;

            for (CeedInt j = 0; j < Q_1D; j++) {
              const CeedInt jj = (j + data.t_id_y) % Q_1D;

              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
            }
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}