// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis AtPoints templates
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Chebyshev values
//------------------------------------------------------------------------------
// Evaluate the first Q_1D Chebyshev polynomials of the second kind at x:
//   U_0 = 1, U_1 = 2 x, U_i = 2 x U_{i-1} - U_{i-2}
// Requires Q_1D >= 2 since indices 0 and 1 are written unconditionally.
template <int Q_1D>
inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) {
  chebyshev_x[0] = 1.0;
  chebyshev_x[1] = 2 * x;
  for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2];
}

// Evaluate the derivatives of the first Q_1D Chebyshev polynomials of the second kind at x.
// Only the derivatives are written to chebyshev_dx; the polynomial values needed by the
// recurrence are kept in a rolling three-entry buffer (chebyshev_x[(i + 1) % 3] holds U_i).
template <int Q_1D>
inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) {
  CeedScalar chebyshev_x[3];

  chebyshev_x[1]  = 1.0;
  chebyshev_x[2]  = 2 * x;
  chebyshev_dx[0] = 0.0;
  chebyshev_dx[1] = 2.0;
  for (CeedInt i = 2; i < Q_1D; i++) {
    // U_i' = 2 U_{i-1} + 2 x U_{i-1}' - U_{i-2}'
    chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3];
    chebyshev_dx[i]          = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 1D interpolate to points
//------------------------------------------------------------------------------
// Interpolate Chebyshev coefficients r_C (one value per component, distributed across the
// threads indexed by t_id_x) to the single point with coordinate r_X[0]; results in r_V.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Barrier added: ensure no thread is still reading the previous component's
    // coefficients from data.slice before they are overwritten below
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints1d: scatter the point value r_U[comp] into the Chebyshev
// coefficients r_C[comp]. Contributions are accumulated (+=) so multiple grid-stride
// passes over the points sum correctly; the caller clears r_C beforehand.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: start index shifted per point so simultaneous points collide less in atomicAdd
        const CeedInt ii = (i + p) % Q_1D;

        // Fixed: index was 'comp * Q_1D + ii', which writes outside the cleared
        // Q_1D-wide slice for comp > 0; the slice holds one component at a time
        atomicAdd(&data.slice[ii], chebyshev_x[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    // Fixed: was 'r_C[comp] = data.slice[p]' — the point index is not a valid slice
    // index and '=' discarded accumulation from earlier grid-stride passes; the 2D/3D
    // variants read the thread's own entry and accumulate
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 1D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the derivative of the Chebyshev expansion r_C at the point r_X[0]; results in r_V.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Barrier added: ensure no thread is still reading the previous component's
    // coefficients from data.slice before they are overwritten below
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
    __syncthreads();
    // Contract x direction
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * data.slice[i];
    }
  }
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints1d: scatter the point derivative value r_U[comp] into the
// Chebyshev coefficients r_C[comp] (accumulated; caller clears r_C beforehand).
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  CeedScalar chebyshev_x[Q_1D];

  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Clear shared memory
    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
    __syncthreads();
    // Contract x direction
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: start index shifted per point so simultaneous points collide less in atomicAdd
        const CeedInt ii = (i + p) % Q_1D;

        // Fixed: index was 'comp * Q_1D + ii' (outside the cleared Q_1D-wide slice)
        atomicAdd(&data.slice[ii], chebyshev_x[ii] * r_U[comp]);
      }
    }
    // Pull from shared to register
    __syncthreads();
    // Fixed: was 'r_C[comp] = data.slice[p]' (wrong index, dropped accumulation)
    if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 2D interpolate to points
//------------------------------------------------------------------------------
// Interpolate the Q_1D x Q_1D Chebyshev coefficients r_C (one per component, distributed
// across threads (t_id_x, t_id_y)) to the point with coordinates r_X; results in r_V.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Barrier added: ensure no thread is still reading the previous component's
    // coefficients from data.slice before they are overwritten below
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = 0.0;
      for (CeedInt j = 0; j < Q_1D; j++) {
        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
      }
    }
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      r_V[comp] += chebyshev_x[i] * buffer[i];
    }
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints2d: scatter the point value r_U[comp] into the Q_1D x Q_1D
// Chebyshev coefficients r_C[comp] (accumulated; caller clears r_C beforehand).
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Clear shared memory
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    // Contract y direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
    for (CeedInt i = 0; i < Q_1D; i++) {
      buffer[i] = chebyshev_x[i] * r_U[comp];
    }
    // Contract x direction
    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
    if (p < NUM_POINTS) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        // Note: indices shifted per point so simultaneous points collide less in atomicAdd
        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;

        for (CeedInt j = 0; j < Q_1D; j++) {
          const CeedInt jj = (j + p) % Q_1D;

          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
        }
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the gradient (d/dx, d/dy) of the Chebyshev expansion r_C at the point r_X;
// results in r_V[comp + dim * NUM_COMP].
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Barrier added: ensure no thread is still reading the previous component's
    // coefficients from data.slice before they are overwritten below
    __syncthreads();
    // Load coefficients
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract x direction; derivative in x only for dim == 0
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y direction; derivative in y only for dim == 1
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints2d: scatter the point's gradient values r_U[comp + dim * NUM_COMP]
// into the Q_1D x Q_1D Chebyshev coefficients r_C[comp]. Both dim contributions are
// accumulated into the same shared slice before a single read-back per component.
// r_C is accumulated (+=); the caller clears it beforehand.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS (the atomicAdd phase is skipped for those).
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    CeedScalar buffer[Q_1D];
    CeedScalar chebyshev_x[Q_1D];

    // Clear shared memory (each thread owns one slice entry; the read-back below
    // touches the same entry, so no extra barrier is needed between iterations)
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
    __syncthreads();
    for (CeedInt dim = 0; dim < 2; dim++) {
      // Contract y direction; derivative in y only for dim == 1
      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
      }
      // Contract x direction; derivative in x only for dim == 0
      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      if (p < NUM_POINTS) {
        for (CeedInt i = 0; i < Q_1D; i++) {
          // Note: indices shifted per point so simultaneous points collide less in atomicAdd
          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt jj = (j + p) % Q_1D;

            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
          }
        }
      }
    }
    // Pull from shared to register
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 3D interpolate to points
//------------------------------------------------------------------------------
// Interpolate the Q_1D^3 Chebyshev coefficients (z-slice k of component comp held in
// r_C[k + comp * Q_1D], distributed across threads (t_id_x, t_id_y)) to the point with
// coordinates r_X; results in r_V.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Barrier added: ensure no thread is still reading the previous z-slice of
      // coefficients from data.slice before it is overwritten below
      __syncthreads();
      // Load coefficients
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = 0.0;
        for (CeedInt j = 0; j < Q_1D; j++) {
          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
        }
      }
      // Contract y and z direction; z contributes only the k-th polynomial value
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
// Transpose of InterpAtPoints3d: scatter the point value r_U[comp] into the Chebyshev
// coefficients r_C[k + comp * Q_1D] (accumulated; caller clears r_C beforehand).
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      // Contract y and z direction; z contributes only the k-th polynomial value
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
      const CeedScalar z = chebyshev_x[k];

      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
      for (CeedInt i = 0; i < Q_1D; i++) {
        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
      }
      // Contract x direction
      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
      if (p < NUM_POINTS) {
        for (CeedInt i = 0; i < Q_1D; i++) {
          // Note: indices shifted per point so simultaneous points collide less in atomicAdd
          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;

          for (CeedInt j = 0; j < Q_1D; j++) {
            const CeedInt jj = ((j + p) % Q_1D);

            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at points
//------------------------------------------------------------------------------
// Evaluate the gradient (d/dx, d/dy, d/dz) of the Chebyshev expansion r_C at the point
// r_X; results in r_V[comp + dim * NUM_COMP].
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
                                      CeedScalar *__restrict__ r_V) {
  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Barrier added: ensure no thread is still reading the previous z-slice of
      // coefficients from data.slice before it is overwritten below
      __syncthreads();
      // Load coefficients
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract x direction; derivative in x only for dim == 0
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = 0.0;
          for (CeedInt j = 0; j < Q_1D; j++) {
            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
          }
        }
        // Contract y and z direction; derivative taken only in the dim direction
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradAtPoints3d: scatter the point's gradient values r_U[comp + dim * NUM_COMP]
// into the Chebyshev coefficients r_C[k + comp * Q_1D]. All three dim contributions are
// accumulated into the same shared slice before a single read-back per (comp, k).
// r_C is accumulated (+=); the caller clears it beforehand.
// Contains __syncthreads(); must be called by every thread in the block, including padding
// iterations where p >= NUM_POINTS.
template <int NUM_COMP, int NUM_POINTS, int Q_1D>
inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    for (CeedInt k = 0; k < Q_1D; k++) {
      CeedScalar buffer[Q_1D];
      CeedScalar chebyshev_x[Q_1D];

      // Clear shared memory
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
      __syncthreads();
      for (CeedInt dim = 0; dim < 3; dim++) {
        // Contract y and z direction; derivative taken only in the dim direction
        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
        const CeedScalar z = chebyshev_x[k];

        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
        for (CeedInt i = 0; i < Q_1D; i++) {
          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
        }
        // Contract x direction
        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
        if (p < NUM_POINTS) {
          for (CeedInt i = 0; i < Q_1D; i++) {
            // Note: indices shifted per point so simultaneous points collide less in atomicAdd
            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;

            for (CeedInt j = 0; j < Q_1D; j++) {
              const CeedInt jj = ((j + p) % Q_1D);

              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
            }
          }
        }
      }
      // Pull from shared to register
      __syncthreads();
      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
    }
  }
}

//------------------------------------------------------------------------------
// Loops over points
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interpolate to points
//------------------------------------------------------------------------------
// Grid-stride loop over the element's points: interpolate coefficients r_C to each point
// of d_X and store the values to d_V. comp_stride is the stride between components (and
// coordinate directions) in the global arrays.
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
  // Round NUM_PTS up to a whole number of passes of the thread block so every thread makes
  // the same number of calls below (the per-point templates contain __syncthreads()).
  // Fixed: previously computed via double-precision ceil(); integer ceil-division is exact
  const CeedInt num_threads = blockDim.x * blockDim.y;
  const CeedInt bound       = ((NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
    const CeedInt p = i % NUM_PTS;
    CeedScalar    r_X[DIM];

    // Fixed: loop bound was the JIT macro BASIS_DIM; use the template parameter DIM so
    // r_X[DIM] cannot be overrun if the macro and template argument ever disagree
    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
    if (DIM == 1) {
      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    } else if (DIM == 2) {
      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    } else if (DIM == 3) {
      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    }
    // Only the first NUM_PTS loop iterations correspond to real points
    if (i < NUM_PTS) {
      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
    }
  }
}

//------------------------------------------------------------------------------
// Interpolate from points
//------------------------------------------------------------------------------
// Grid-stride loop over the element's points: accumulate each point value of d_U into the
// Chebyshev coefficients r_C. r_U is per-thread scratch; r_C holds NUM_COMP entries in
// 1D/2D and NUM_COMP * Q_1D entries (one z-slice each) in 3D. Points at or beyond
// points_per_elem contribute zero.
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
                                               CeedScalar *__restrict__ r_C) {
  // Clear register
  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;

  // Round NUM_PTS up to a whole number of passes of the thread block so every thread makes
  // the same number of calls below (the per-point templates contain __syncthreads())
  const CeedInt num_threads = blockDim.x * blockDim.y;
  const CeedInt bound       = ((NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
    const CeedInt p = i % NUM_PTS;
    CeedScalar    r_X[DIM];

    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
    for (CeedInt j = 0; j < NUM_COMP; j++) {
      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
      else r_U[j] = 0.0;
    }
    // Fixed: dispatch used the JIT macro BASIS_DIM; use the template parameter DIM for
    // consistency with the forward wrappers above
    if (DIM == 1) {
      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    } else if (DIM == 2) {
      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    } else if (DIM == 3) {
      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    }
  }
  __syncthreads();
}

//------------------------------------------------------------------------------
// Gradient at points
//------------------------------------------------------------------------------
// Grid-stride loop over the element's points: evaluate the gradient of the Chebyshev
// expansion r_C at each point of d_X and store the DIM * NUM_COMP values to d_V.
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
  // Round NUM_PTS up to a whole number of passes of the thread block so every thread makes
  // the same number of calls below (the per-point templates contain __syncthreads())
  const CeedInt num_threads = blockDim.x * blockDim.y;
  const CeedInt bound       = ((NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
    const CeedInt p = i % NUM_PTS;
    CeedScalar    r_X[DIM];

    // Fixed: loop bound was the JIT macro BASIS_DIM; use the template parameter DIM
    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
    if (DIM == 1) {
      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    } else if (DIM == 2) {
      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    } else if (DIM == 3) {
      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
    }
    // Only the first NUM_PTS loop iterations correspond to real points
    if (i < NUM_PTS) {
      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
    }
  }
}

//------------------------------------------------------------------------------
// Grad from points
//------------------------------------------------------------------------------
// Grid-stride loop over the element's points: accumulate each point's DIM * NUM_COMP
// gradient values from d_U into the Chebyshev coefficients r_C. r_U is per-thread scratch;
// points at or beyond points_per_elem contribute zero.
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
                                             CeedScalar *__restrict__ r_C) {
  // Clear register
  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;

  // Round NUM_PTS up to a whole number of passes of the thread block so every thread makes
  // the same number of calls below (the per-point templates contain __syncthreads())
  const CeedInt num_threads = blockDim.x * blockDim.y;
  const CeedInt bound       = ((NUM_PTS + num_threads - 1) / num_threads) * num_threads;

  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += num_threads) {
    const CeedInt p = i % NUM_PTS;
    CeedScalar    r_X[DIM];

    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
      else r_U[j] = 0.0;
    }
    // Fixed: dispatch used the JIT macro BASIS_DIM; use the template parameter DIM for
    // consistency with the forward wrappers above
    if (DIM == 1) {
      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    } else if (DIM == 2) {
      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    } else if (DIM == 3) {
      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
    }
  }
  __syncthreads();
}