// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis AtPoints templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D (scaled) Chebyshev polynomials at point x.
// Recurrence: T_i = 2 x T_{i-1} - T_{i-2}; note this file's convention uses
// T_1 = 2 x (scaled basis), matching ChebyshevDerivativeAtPoint below.
// chebyshev_x must hold at least Q_1D entries.
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate derivatives of the first Q_1D (scaled) Chebyshev polynomials at x.
// Uses dT_i = 2 x dT_{i-1} + 2 T_{i-1} - dT_{i-2}; polynomial values are kept
// in a rolling 3-entry buffer where T_i lives at index (i + 1) % 3, so only
// O(1) extra storage is needed. chebyshev_dx must hold at least Q_1D entries.
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar chebyshev_x[3];

  // T_0 at slot 1, T_1 at slot 2 per the (i + 1) % 3 layout
  chebyshev_x[1]  = 1.0;
  chebyshev_x[2]  = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    // chebyshev_x[(i + 0) % 3] = T_{i-1}, chebyshev_x[(i + 2) % 3] = T_{i-2}
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D interpolate to points
//------------------------------------------------------------------------------
// Interpolate Chebyshev coefficients r_C (one per thread, NUM_COMP components)
// to this thread's point p with coordinate r_X[0]; accumulate into r_V[comp].
// Threads t_id_x < Q_1D stage one coefficient each in data.slice per component.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Load coefficients; barrier first so the previous iteration's reads finish
    __syncthreads();
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose action: scatter this thread's point value r_U[comp] back onto the
// Q_1D Chebyshev coefficients, accumulating into r_C[comp] (coefficient
// t_id_x). One component is processed per pass through data.slice[0, Q_1D).
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction; only threads holding a valid point contribute
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting start index by t_id_x to reduce atomic contention.
        // Fix: no comp * Q_1D offset — the slice holds a single component per
        // pass, and the old offset wrote outside the cleared/read-back range
        // [0, Q_1D), silently dropping every component after the first.
        atomicAdd(&data.slice[(i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the derivative of the Chebyshev expansion r_C at this thread's
// point coordinate r_X[0]; accumulate into r_V[comp]. Identical staging to
// InterpAtPoints1d but contracts against Chebyshev derivative values.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints1d: scatter point derivative values r_U back onto
// the Chebyshev coefficients, accumulating into r_C[comp] (coefficient t_id_x).
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction; only threads holding a valid point contribute
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting start index by t_id_x to reduce atomic contention.
        // Fix: no comp * Q_1D offset (see InterpTransposeAtPoints1d) — writes
        // must land in the cleared/read-back range [0, Q_1D).
        atomicAdd(&data.slice[(i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D interpolate to points
//------------------------------------------------------------------------------
// Interpolate the 2D Chebyshev expansion to this thread's point (r_X[0], r_X[1]).
// Threads (t_id_x, t_id_y) with both indices < Q_1D stage one coefficient each
// in data.slice laid out as [x + y * Q_1D]; results accumulate into r_V[comp].
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Load coefficients; barrier first so the previous iteration's reads finish
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = 0.0;
      for (CeedInt j = 0; j < Q_1D; j++) {
        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
      }
    }
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * buffer[i];
    }
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose action in 2D: scatter this thread's point value r_U[comp] onto the
// Q_1D x Q_1D coefficient tile in data.slice, then each (t_id_x, t_id_y) thread
// pulls its own coefficient back into r_C[comp].
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Clear shared memory
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = chebyshev_x[i] * r_U[comp];
    }
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: shifting to avoid atomic adds
        const CeedInt ii = (i + data.t_id_x) % Q_1D;

        for (CeedInt j = 0; j < Q_1D; j++) {
          const CeedInt jj = (j + data.t_id_y) % Q_1D;

          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
        }
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at points
//------------------------------------------------------------------------------
// Evaluate both partial derivatives of the 2D Chebyshev expansion at this
// thread's point. Output layout: r_V[comp + dim * NUM_COMP] for dim in {0, 1};
// the dim-th pass uses derivative values in direction dim and polynomial
// values in the other direction.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Load coefficients; barrier first so the previous iteration's reads finish
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract x direction
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y direction
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints2d: scatter both partial-derivative point values
// r_U[comp + dim * NUM_COMP] onto the coefficient tile, accumulating into
// r_C[comp]. The slice is cleared once per component; both dim passes add in.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Clear shared memory
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract y direction
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
      }
      // Contract x direction
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      if (p < NUM_POINTS) {
        for (CeedInt i = 0; i < Q_1D; i++) {
          // Note: shifting to avoid atomic adds
          const CeedInt ii = (i + data.t_id_x) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt jj = (j + data.t_id_y) % Q_1D;

            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
          }
        }
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D interpolate to points
//------------------------------------------------------------------------------
// Interpolate the 3D Chebyshev expansion to this thread's point (r_X[0..2]).
// Coefficients are held with the z index in registers: thread (t_id_x, t_id_y)
// owns r_C[k + comp * Q_1D] for k in [0, Q_1D). Each k iteration stages one
// x-y plane in data.slice (layout [x + y * Q_1D]) and contracts it against the
// point's Chebyshev values, weighted by the z-direction value at index k.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Load coefficients; leading barrier protects the previous plane's reads
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y and z direction
      // z is evaluated first so chebyshev_x can be reused for the y values
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose action in 3D: for each z index k, scatter this thread's point
// value r_U[comp], weighted by y and z Chebyshev values, onto the x-y
// coefficient plane in data.slice, then accumulate back into
// r_C[k + comp * Q_1D]. Index shifts (ii, jj) spread atomics across the tile.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      // Contract y and z direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
      }
      // Contract x direction; only threads holding a valid point contribute
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      if (p < NUM_POINTS) {
        for (CeedInt i = 0; i < Q_1D; i++) {
          // Note: shifting to avoid atomic adds
          const CeedInt ii = (i + data.t_id_x) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt jj = (j + data.t_id_y) % Q_1D;

            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at points
//------------------------------------------------------------------------------
// Evaluate all three partial derivatives of the 3D Chebyshev expansion at this
// thread's point. Output layout: r_V[comp + dim * NUM_COMP] for dim in {0,1,2};
// the dim-th pass uses Chebyshev derivative values in direction dim and
// polynomial values in the other two directions.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Load coefficients; leading barrier protects the previous plane's reads
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract x direction
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = 0.0;
          for (CeedInt j = 0; j < Q_1D; j++) {
            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
          }
        }
        // Contract y and z direction
        // z is evaluated first so chebyshev_x can be reused for the y values
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints3d: for each z index k, scatter all three
// partial-derivative point values r_U[comp + dim * NUM_COMP] onto the x-y
// coefficient plane, accumulating into r_C[k + comp * Q_1D]. The slice is
// cleared once per (comp, k); all three dim passes add into it.
template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract y and z direction
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
        }
        // Contract x direction; only threads holding a valid point contribute
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        if (p < NUM_POINTS) {
          for (CeedInt i = 0; i < Q_1D; i++) {
            // Note: shifting to avoid atomic adds
            const CeedInt ii = (i + data.t_id_x) % Q_1D;

            for (CeedInt j = 0; j < Q_1D; j++) {
              const CeedInt jj = (j + data.t_id_y) % Q_1D;

              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
            }
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}