1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA shared memory tensor product basis AtPoints templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Chebyshev values 14 //------------------------------------------------------------------------------ 15 template <int Q_1D> 16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 17 chebyshev_x[0] = 1.0; 18 chebyshev_x[1] = 2 * x; 19 for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 20 } 21 22 template <int Q_1D> 23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 24 CeedScalar chebyshev_x[3]; 25 26 chebyshev_x[1] = 1.0; 27 chebyshev_x[2] = 2 * x; 28 chebyshev_dx[0] = 0.0; 29 chebyshev_dx[1] = 2.0; 30 for (CeedInt i = 2; i < Q_1D; i++) { 31 chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 32 chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 33 } 34 } 35 36 //------------------------------------------------------------------------------ 37 // 1D 38 //------------------------------------------------------------------------------ 39 40 //------------------------------------------------------------------------------ 41 // 1D interpolate to points 42 //------------------------------------------------------------------------------ 43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 44 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 45 CeedScalar *__restrict__ r_V) { 46 CeedScalar chebyshev_x[Q_1D]; 47 48 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 49 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 51 // Load coefficients 52 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 53 __syncthreads(); 54 // Contract x direction 55 for (CeedInt i = 0; i < Q_1D; i++) { 56 r_V[comp] += chebyshev_x[i] * data.slice[i]; 57 } 58 } 59 } 60 61 //------------------------------------------------------------------------------ 62 // 1D interpolate transpose 63 //------------------------------------------------------------------------------ 64 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 65 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 66 CeedScalar *__restrict__ r_C) { 67 CeedScalar chebyshev_x[Q_1D]; 68 69 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 70 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 71 // Clear shared memory 72 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 73 __syncthreads(); 74 // Contract x direction 75 if (p < NUM_POINTS) { 76 for (CeedInt i = 0; i < Q_1D; i++) { 77 atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 78 } 79 } 80 // Pull from shared to register 81 __syncthreads(); 82 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 83 } 84 } 85 86 //------------------------------------------------------------------------------ 87 // 1D derivatives at points 88 //------------------------------------------------------------------------------ 89 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 90 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 91 CeedScalar *__restrict__ r_V) { 92 CeedScalar chebyshev_x[Q_1D]; 93 94 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 95 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 96 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 97 // Load coefficients 98 __syncthreads(); 99 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 100 __syncthreads(); 101 // Contract x direction 102 for (CeedInt i = 0; i < Q_1D; i++) { 103 r_V[comp] += chebyshev_x[i] * data.slice[i]; 104 } 105 } 106 } 107 108 //------------------------------------------------------------------------------ 109 // 1D derivatives transpose 110 //------------------------------------------------------------------------------ 111 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 112 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 113 CeedScalar *__restrict__ r_C) { 114 CeedScalar chebyshev_x[Q_1D]; 115 116 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 117 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 118 // Clear shared memory 119 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 120 __syncthreads(); 121 // Contract x direction 122 if (p < NUM_POINTS) { 123 for (CeedInt i = 0; i < Q_1D; i++) { 124 atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 125 } 126 } 127 // Pull from shared to register 128 __syncthreads(); 129 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 130 } 131 } 132 133 //------------------------------------------------------------------------------ 134 // 2D 135 //------------------------------------------------------------------------------ 136 137 //------------------------------------------------------------------------------ 138 // 2D interpolate to points 139 //------------------------------------------------------------------------------ 140 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 141 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 142 CeedScalar *__restrict__ r_V) { 143 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 144 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 145 CeedScalar buffer[Q_1D]; 146 CeedScalar chebyshev_x[Q_1D]; 147 148 // Load coefficients 149 __syncthreads(); 150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 151 __syncthreads(); 152 // Contract x direction 153 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 154 for (CeedInt i = 0; i < Q_1D; i++) { 155 buffer[i] = 0.0; 156 for (CeedInt j = 0; j < Q_1D; j++) { 157 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 158 } 159 } 160 // Contract y direction 161 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 162 for (CeedInt i = 0; i < Q_1D; i++) { 163 r_V[comp] += chebyshev_x[i] * buffer[i]; 164 } 165 } 166 } 167 168 //------------------------------------------------------------------------------ 169 // 2D interpolate transpose 170 //------------------------------------------------------------------------------ 171 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 172 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 173 CeedScalar *__restrict__ r_C) { 174 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 175 CeedScalar buffer[Q_1D]; 176 CeedScalar chebyshev_x[Q_1D]; 177 178 // Clear shared memory 179 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 180 __syncthreads(); 181 // Contract y direction 182 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 183 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; 184 185 for (CeedInt i = 0; i < Q_1D; i++) { 186 buffer[i] = chebyshev_x[i] * r_u; 187 } 188 // Contract x direction 189 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 190 for (CeedInt i = 0; i < Q_1D; i++) { 191 // Note: shifting to avoid atomic adds 192 const CeedInt ii = (i + data.t_id_y) % Q_1D; 193 194 for (CeedInt j = 0; j < Q_1D; j++) { 195 const CeedInt jj = (j + data.t_id_x) % Q_1D; 196 197 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 198 } 199 } 200 // Pull from shared to register 201 __syncthreads(); 202 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 203 } 204 } 205 206 //------------------------------------------------------------------------------ 207 // 2D derivatives at points 208 //------------------------------------------------------------------------------ 209 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 210 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 211 CeedScalar *__restrict__ r_V) { 212 for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 213 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 214 CeedScalar buffer[Q_1D]; 215 CeedScalar chebyshev_x[Q_1D]; 216 217 // Load coefficients 218 __syncthreads(); 219 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 220 __syncthreads(); 221 for (CeedInt dim = 0; dim < 2; dim++) { 222 // Contract x direction 223 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 224 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 225 for (CeedInt i = 0; i < Q_1D; i++) { 226 buffer[i] = 0.0; 227 for (CeedInt j = 0; j < Q_1D; j++) { 228 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 229 } 230 } 231 // Contract y direction 232 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 233 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 234 for (CeedInt i = 0; i < Q_1D; i++) { 235 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 236 } 237 } 238 } 239 } 240 241 //------------------------------------------------------------------------------ 242 // 2D derivatives transpose 243 //------------------------------------------------------------------------------ 244 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 245 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 246 CeedScalar *__restrict__ r_C) { 247 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 248 CeedScalar buffer[Q_1D]; 249 CeedScalar chebyshev_x[Q_1D]; 250 251 // Clear shared memory 252 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 253 __syncthreads(); 254 for (CeedInt dim = 0; dim < 2; dim++) { 255 // Contract y direction 256 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 257 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 258 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0; 259 260 for (CeedInt i = 0; i < Q_1D; i++) { 261 buffer[i] = chebyshev_x[i] * r_u; 262 } 263 // Contract x direction 264 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 265 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 266 for (CeedInt i = 0; i < Q_1D; i++) { 267 // Note: shifting to avoid atomic adds 268 const CeedInt ii = (i + data.t_id_y) % Q_1D; 269 270 for (CeedInt j = 0; j < Q_1D; j++) { 271 const CeedInt jj = (j + data.t_id_x) % Q_1D; 272 273 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 274 } 275 } 276 } 277 // Pull from shared to register 278 __syncthreads(); 279 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 280 } 281 } 282 283 //------------------------------------------------------------------------------ 284 // 3D 285 //------------------------------------------------------------------------------ 286 287 //------------------------------------------------------------------------------ 288 // 3D interpolate to points 289 //------------------------------------------------------------------------------ 290 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 291 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 292 CeedScalar *__restrict__ r_V) { 293 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 294 for (CeedInt k = 0; k < Q_1D; k++) { 295 CeedScalar buffer[Q_1D]; 296 CeedScalar chebyshev_x[Q_1D]; 297 298 // Get z contraction value 299 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 300 const CeedScalar z = chebyshev_x[k]; 301 302 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 303 // Load coefficients 304 __syncthreads(); 305 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 306 __syncthreads(); 307 // Contract x direction 308 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 309 for (CeedInt i = 0; i < Q_1D; i++) { 310 buffer[i] = 0.0; 311 for (CeedInt j = 0; j < Q_1D; j++) { 312 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 313 } 314 } 315 // Contract y and z direction 316 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 317 for (CeedInt i = 0; i < Q_1D; i++) { 318 r_V[comp] += chebyshev_x[i] * buffer[i] * z; 319 } 320 } 321 } 322 } 323 324 //------------------------------------------------------------------------------ 325 // 3D interpolate transpose 326 //------------------------------------------------------------------------------ 327 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 328 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 329 CeedScalar *__restrict__ r_C) { 330 for (CeedInt k = 0; k < Q_1D; k++) { 331 CeedScalar buffer[Q_1D]; 332 CeedScalar chebyshev_x[Q_1D]; 333 334 // Get z contraction value 335 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 336 const CeedScalar z = chebyshev_x[k]; 337 338 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 339 // Clear shared memory 340 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 341 __syncthreads(); 342 // Contract y and z direction 343 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 344 const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0; 345 346 for (CeedInt i = 0; i < Q_1D; i++) { 347 buffer[i] = chebyshev_x[i] * r_u * z; 348 } 349 // Contract x direction 350 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 351 for (CeedInt i = 0; i < Q_1D; i++) { 352 // Note: shifting to avoid atomic adds 353 const CeedInt ii = (i + data.t_id_y) % Q_1D; 354 355 for (CeedInt j = 0; j < Q_1D; j++) { 356 const CeedInt jj = (j + data.t_id_x) % Q_1D; 357 358 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 359 } 360 } 361 // Pull from shared to register 362 __syncthreads(); 363 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 364 } 365 } 366 } 367 368 //------------------------------------------------------------------------------ 369 // 3D derivatives at points 370 //------------------------------------------------------------------------------ 371 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 372 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, 373 CeedScalar *__restrict__ r_V) { 374 for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 375 for (CeedInt k = 0; k < Q_1D; k++) { 376 CeedScalar buffer[Q_1D]; 377 CeedScalar chebyshev_x[Q_1D]; 378 379 // Get z contraction values 380 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 381 const CeedScalar z = chebyshev_x[k]; 382 383 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 384 const CeedScalar dz = chebyshev_x[k]; 385 386 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 387 // Load coefficients 388 __syncthreads(); 389 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 390 __syncthreads(); 391 // Gradient directions 392 for (CeedInt dim = 0; dim < 3; dim++) { 393 // Contract x direction 394 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 395 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 396 for (CeedInt i = 0; i < Q_1D; i++) { 397 buffer[i] = 0.0; 398 for (CeedInt j = 0; j < Q_1D; j++) { 399 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 400 } 401 } 402 // Contract y and z direction 403 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 404 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 405 const CeedScalar zz = dim == 2 ? dz : z; 406 407 for (CeedInt i = 0; i < Q_1D; i++) { 408 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz; 409 } 410 } 411 } 412 } 413 } 414 415 //------------------------------------------------------------------------------ 416 // 3D derivatives transpose 417 //------------------------------------------------------------------------------ 418 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 419 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, 420 CeedScalar *__restrict__ r_C) { 421 for (CeedInt k = 0; k < Q_1D; k++) { 422 CeedScalar buffer[Q_1D]; 423 CeedScalar chebyshev_x[Q_1D]; 424 425 // Get z contraction values 426 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 427 const CeedScalar z = chebyshev_x[k]; 428 429 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 430 const CeedScalar dz = chebyshev_x[k]; 431 432 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 433 // Clear shared memory 434 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 435 __syncthreads(); 436 // Gradient directions 437 for (CeedInt dim = 0; dim < 3; dim++) { 438 // Contract y and z direction 439 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 440 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 441 const CeedScalar zz = dim == 2 ? dz : z; 442 const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0; 443 444 for (CeedInt i = 0; i < Q_1D; i++) { 445 buffer[i] = chebyshev_x[i] * r_u * zz; 446 } 447 448 // Contract x direction 449 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 450 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 451 for (CeedInt i = 0; i < Q_1D; i++) { 452 // Note: shifting to avoid atomic adds 453 const CeedInt ii = (i + data.t_id_y) % Q_1D; 454 455 for (CeedInt j = 0; j < Q_1D; j++) { 456 const CeedInt jj = (j + data.t_id_x) % Q_1D; 457 458 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 459 } 460 } 461 } 462 // Pull from shared to register 463 __syncthreads(); 464 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 465 } 466 } 467 } 468