1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis AtPoints templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Chebyshev values 14 //------------------------------------------------------------------------------ 15 template <int Q_1D> 16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 17 chebyshev_x[0] = 1.0; 18 chebyshev_x[1] = 2 * x; 19 for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 20 } 21 22 template <int Q_1D> 23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 24 CeedScalar chebyshev_x[3]; 25 26 chebyshev_x[1] = 1.0; 27 chebyshev_x[2] = 2 * x; 28 chebyshev_dx[0] = 0.0; 29 chebyshev_dx[1] = 2.0; 30 for (CeedInt i = 2; i < Q_1D; i++) { 31 chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 32 chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 33 } 34 } 35 36 //------------------------------------------------------------------------------ 37 // 1D 38 //------------------------------------------------------------------------------ 39 40 //------------------------------------------------------------------------------ 41 // 1D interpolate to points 42 //------------------------------------------------------------------------------ 43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 45 CeedScalar *__restrict__ r_V) { 46 CeedScalar chebyshev_x[Q_1D]; 47 48 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 49 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 51 // Load coefficients 52 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 53 __syncthreads(); 54 // Contract x direction 55 for (CeedInt i = 0; i < Q_1D; i++) { 56 r_V[comp] += chebyshev_x[i] * data.slice[i]; 57 } 58 } 59 } 60 61 //------------------------------------------------------------------------------ 62 // 1D interpolate transpose 63 //------------------------------------------------------------------------------ 64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 66 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 67 CeedScalar chebyshev_x[Q_1D]; 68 69 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 70 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 71 // Clear shared memory 72 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 73 __syncthreads(); 74 // Contract x direction 75 if (p < NUM_POINTS) { 76 for (CeedInt i = 0; i < Q_1D; i++) { 77 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 78 } 79 } 80 // Pull from shared to register 81 __syncthreads(); 82 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 83 } 84 } 85 86 //------------------------------------------------------------------------------ 87 // 1D derivatives at points 88 //------------------------------------------------------------------------------ 89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 90 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 91 CeedScalar *__restrict__ r_V) { 92 CeedScalar chebyshev_x[Q_1D]; 93 94 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 95 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 96 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 97 // Load coefficients 98 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 99 __syncthreads(); 100 // Contract x direction 101 for (CeedInt i = 0; i < Q_1D; i++) { 102 r_V[comp] += chebyshev_x[i] * data.slice[i]; 103 } 104 } 105 } 106 107 //------------------------------------------------------------------------------ 108 // 1D derivatives transpose 109 //------------------------------------------------------------------------------ 110 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 111 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 112 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 113 CeedScalar chebyshev_x[Q_1D]; 114 115 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 116 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 117 // Clear shared memory 118 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 119 __syncthreads(); 120 // Contract x direction 121 if (p < NUM_POINTS) { 122 for (CeedInt i = 0; i < Q_1D; i++) { 123 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 124 } 125 } 126 // Pull from shared to register 127 __syncthreads(); 128 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 129 } 130 } 131 132 //------------------------------------------------------------------------------ 133 // 2D 134 //------------------------------------------------------------------------------ 135 136 //------------------------------------------------------------------------------ 137 // 2D interpolate to points 138 //------------------------------------------------------------------------------ 139 template <int NUM_COMP, int NUM_POINTS, int Q_1D> 140 inline __device__ void InterpAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p, 141 const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) { 142 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 143 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 144 CeedScalar buffer[Q_1D]; 145 CeedScalar chebyshev_x[Q_1D]; 146 147 // Load coefficients 148 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp]; 149 __syncthreads(); 150 // Contract x direction 151 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 152 for (CeedInt i = 0; i < Q_1D; i++) { 153 buffer[i] = 0.0; 154 for (CeedInt j = 0; j < Q_1D; j++) { 155 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 156 } 157 } 158 // Contract y direction 159 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 160 for (CeedInt i = 0; i < Q_1D; i++) { 161 r_V[comp] += chebyshev_x[i] * buffer[i]; 162 } 163 } 164 } 165 166 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 167 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 168 CeedScalar *__restrict__ r_V) { 169 InterpAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V); 170 } 171 172 //------------------------------------------------------------------------------ 173 // 2D interpolate transpose 174 //------------------------------------------------------------------------------ 175 template <int NUM_COMP, int NUM_POINTS, int Q_1D> 176 inline __device__ void InterpTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p, 177 const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X, 178 CeedScalar *__restrict__ r_C) { 179 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 180 CeedScalar buffer[Q_1D]; 181 CeedScalar chebyshev_x[Q_1D]; 182 183 // Clear shared memory 184 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0; 185 __syncthreads(); 186 // Contract y direction 187 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 188 for (CeedInt i = 0; i < Q_1D; i++) { 189 buffer[i] = chebyshev_x[i] * r_U[comp]; 190 } 191 // Contract x direction 192 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 193 if (p < NUM_POINTS) { 194 for (CeedInt i = 0; i < Q_1D; i++) { 195 // Note: shifting to avoid atomic adds 196 const CeedInt ii = (i + t_id_x) % Q_1D; 197 198 for (CeedInt j = 0; j < Q_1D; j++) { 199 const CeedInt jj = (j + t_id_y) % Q_1D; 200 201 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 202 } 203 } 204 } 205 // Pull from shared to register 206 __syncthreads(); 207 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 208 } 209 } 210 211 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 212 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 213 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 214 InterpTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C); 215 } 216 217 //------------------------------------------------------------------------------ 218 // 2D derivatives at points 219 //------------------------------------------------------------------------------ 220 template <int NUM_COMP, int NUM_POINTS, int Q_1D> 221 inline __device__ void GradAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p, 222 const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_V) { 223 for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 224 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 225 CeedScalar buffer[Q_1D]; 226 CeedScalar chebyshev_x[Q_1D]; 227 228 // Load coefficients 229 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = r_C[comp]; 230 __syncthreads(); 231 for (CeedInt dim = 0; dim < 2; dim++) { 232 // Contract x direction 233 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 234 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 235 for (CeedInt i = 0; i < Q_1D; i++) { 236 buffer[i] = 0.0; 237 for (CeedInt j = 0; j < Q_1D; j++) { 238 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 239 } 240 } 241 // Contract y direction 242 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 243 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 244 for (CeedInt i = 0; i < Q_1D; i++) { 245 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 246 } 247 } 248 } 249 } 250 251 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 252 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 253 CeedScalar *__restrict__ r_V) { 254 GradAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_C, r_X, r_V); 255 } 256 257 //------------------------------------------------------------------------------ 258 // 2D derivatives transpose 259 //------------------------------------------------------------------------------ 260 template <int NUM_COMP, int NUM_POINTS, int Q_1D> 261 inline __device__ void GradTransposeAtPoints2d_Core(SharedData_Hip &data, const int t_id_x, const int t_id_y, const CeedInt p, 262 const CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ r_X, 263 CeedScalar *__restrict__ r_C) { 264 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 265 CeedScalar buffer[Q_1D]; 266 CeedScalar chebyshev_x[Q_1D]; 267 268 // Clear shared memory 269 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * Q_1D] = 0.0; 270 __syncthreads(); 271 for (CeedInt dim = 0; dim < 2; dim++) { 272 // Contract y direction 273 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 274 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 275 for (CeedInt i = 0; i < Q_1D; i++) { 276 buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 277 } 278 // Contract x direction 279 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 280 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 281 if (p < NUM_POINTS) { 282 for (CeedInt i = 0; i < Q_1D; i++) { 283 // Note: shifting to avoid atomic adds 284 const CeedInt ii = (i + t_id_x) % Q_1D; 285 286 for (CeedInt j = 0; j < Q_1D; j++) { 287 const CeedInt jj = (j + t_id_y) % Q_1D; 288 289 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 290 } 291 } 292 } 293 } 294 // Pull from shared to register 295 __syncthreads(); 296 if (t_id_x < Q_1D && t_id_y < Q_1D) r_C[comp] += data.slice[t_id_x + t_id_y * Q_1D]; 297 } 298 } 299 300 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 301 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 302 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 303 GradTransposeAtPoints2d_Core<NUM_COMP, NUM_POINTS, Q_1D>(data, data.t_id_x, data.t_id_y, p, r_U, r_X, r_C); 304 } 305 306 //------------------------------------------------------------------------------ 307 // 3D 308 //------------------------------------------------------------------------------ 309 310 //------------------------------------------------------------------------------ 311 // 3D interpolate to points 312 //------------------------------------------------------------------------------ 313 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 314 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 315 CeedScalar *__restrict__ r_V) { 316 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 317 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 318 for (CeedInt k = 0; k < Q_1D; k++) { 319 CeedScalar buffer[Q_1D]; 320 CeedScalar chebyshev_x[Q_1D]; 321 322 // Load coefficients 323 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 324 __syncthreads(); 325 // Contract x direction 326 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 327 for (CeedInt i = 0; i < Q_1D; i++) { 328 buffer[i] = 0.0; 329 for (CeedInt j = 0; j < Q_1D; j++) { 330 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 331 } 332 } 333 // Contract y and z direction 334 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 335 const CeedScalar z = chebyshev_x[k]; 336 337 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 338 for (CeedInt i = 0; i < Q_1D; i++) { 339 r_V[comp] += chebyshev_x[i] * buffer[i] * z; 340 } 341 } 342 } 343 } 344 345 //------------------------------------------------------------------------------ 346 // 3D interpolate transpose 347 //------------------------------------------------------------------------------ 348 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 349 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 350 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 351 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 352 for (CeedInt k = 0; k < Q_1D; k++) { 353 CeedScalar buffer[Q_1D]; 354 CeedScalar chebyshev_x[Q_1D]; 355 356 // Clear shared memory 357 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 358 __syncthreads(); 359 // Contract y and z direction 360 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 361 const CeedScalar z = chebyshev_x[k]; 362 363 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 364 for (CeedInt i = 0; i < Q_1D; i++) { 365 buffer[i] = chebyshev_x[i] * r_U[comp] * z; 366 } 367 // Contract x direction 368 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 369 if (p < NUM_POINTS) { 370 for (CeedInt i = 0; i < Q_1D; i++) { 371 // Note: shifting to avoid atomic adds 372 const CeedInt ii = (i + data.t_id_x) % Q_1D; 373 374 for (CeedInt j = 0; j < Q_1D; j++) { 375 const CeedInt jj = (j + data.t_id_y) % Q_1D; 376 377 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 378 } 379 } 380 } 381 // Pull from shared to register 382 __syncthreads(); 383 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 384 } 385 } 386 } 387 388 //------------------------------------------------------------------------------ 389 // 3D derivatives at points 390 //------------------------------------------------------------------------------ 391 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 392 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 393 CeedScalar *__restrict__ r_V) { 394 for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 395 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 396 for (CeedInt k = 0; k < Q_1D; k++) { 397 CeedScalar buffer[Q_1D]; 398 CeedScalar chebyshev_x[Q_1D]; 399 400 // Load coefficients 401 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 402 __syncthreads(); 403 for (CeedInt dim = 0; dim < 3; dim++) { 404 // Contract x direction 405 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 406 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 407 for (CeedInt i = 0; i < Q_1D; i++) { 408 buffer[i] = 0.0; 409 for (CeedInt j = 0; j < Q_1D; j++) { 410 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 411 } 412 } 413 // Contract y and z direction 414 if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 415 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 416 const CeedScalar z = chebyshev_x[k]; 417 418 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 419 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 420 for (CeedInt i = 0; i < Q_1D; i++) { 421 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; 422 } 423 } 424 } 425 } 426 } 427 428 //------------------------------------------------------------------------------ 429 // 3D derivatives transpose 430 //------------------------------------------------------------------------------ 431 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 432 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 433 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 434 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 435 for (CeedInt k = 0; k < Q_1D; k++) { 436 CeedScalar buffer[Q_1D]; 437 CeedScalar chebyshev_x[Q_1D]; 438 439 // Clear shared memory 440 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 441 __syncthreads(); 442 for (CeedInt dim = 0; dim < 3; dim++) { 443 // Contract y and z direction 444 if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 445 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 446 const CeedScalar z = chebyshev_x[k]; 447 448 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 449 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 450 for (CeedInt i = 0; i < Q_1D; i++) { 451 buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; 452 } 453 // Contract x direction 454 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 455 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 456 if (p < NUM_POINTS) { 457 for (CeedInt i = 0; i < Q_1D; i++) { 458 // Note: shifting to avoid atomic adds 459 const CeedInt ii = (i + data.t_id_x) % Q_1D; 460 461 for (CeedInt j = 0; j < Q_1D; j++) { 462 const CeedInt jj = (j + data.t_id_y) % Q_1D; 463 464 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 465 } 466 } 467 } 468 } 469 // Pull from shared to register 470 __syncthreads(); 471 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 472 } 473 } 474 } 475