1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP shared memory tensor product basis AtPoints templates 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Chebyshev values 14 //------------------------------------------------------------------------------ 15 template <int Q_1D> 16 inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { 17 chebyshev_x[0] = 1.0; 18 chebyshev_x[1] = 2 * x; 19 for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; 20 } 21 22 template <int Q_1D> 23 inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { 24 CeedScalar chebyshev_x[3]; 25 26 chebyshev_x[1] = 1.0; 27 chebyshev_x[2] = 2 * x; 28 chebyshev_dx[0] = 0.0; 29 chebyshev_dx[1] = 2.0; 30 for (CeedInt i = 2; i < Q_1D; i++) { 31 chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; 32 chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; 33 } 34 } 35 36 //------------------------------------------------------------------------------ 37 // 1D 38 //------------------------------------------------------------------------------ 39 40 //------------------------------------------------------------------------------ 41 // 1D interpolate to points 42 //------------------------------------------------------------------------------ 43 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 44 inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 45 CeedScalar *__restrict__ r_V) { 46 CeedScalar chebyshev_x[Q_1D]; 47 48 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 49 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 50 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 51 // Load coefficients 52 __syncthreads(); 53 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 54 __syncthreads(); 55 // Contract x direction 56 for (CeedInt i = 0; i < Q_1D; i++) { 57 r_V[comp] += chebyshev_x[i] * data.slice[i]; 58 } 59 } 60 } 61 62 //------------------------------------------------------------------------------ 63 // 1D interpolate transpose 64 //------------------------------------------------------------------------------ 65 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 66 inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 67 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 68 CeedScalar chebyshev_x[Q_1D]; 69 70 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 71 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 72 // Clear shared memory 73 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 74 __syncthreads(); 75 // Contract x direction 76 if (p < NUM_POINTS) { 77 for (CeedInt i = 0; i < Q_1D; i++) { 78 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 79 } 80 } 81 // Pull from shared to register 82 __syncthreads(); 83 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 84 } 85 } 86 87 //------------------------------------------------------------------------------ 88 // 1D derivatives at points 89 //------------------------------------------------------------------------------ 90 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 91 inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 92 CeedScalar *__restrict__ r_V) { 93 CeedScalar chebyshev_x[Q_1D]; 94 95 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 96 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 98 __syncthreads(); 99 // Load coefficients 100 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; 101 __syncthreads(); 102 // Contract x direction 103 for (CeedInt i = 0; i < Q_1D; i++) { 104 r_V[comp] += chebyshev_x[i] * data.slice[i]; 105 } 106 } 107 } 108 109 //------------------------------------------------------------------------------ 110 // 1D derivatives transpose 111 //------------------------------------------------------------------------------ 112 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 113 inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 114 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 115 CeedScalar chebyshev_x[Q_1D]; 116 117 ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 118 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 119 // Clear shared memory 120 if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; 121 __syncthreads(); 122 // Contract x direction 123 if (p < NUM_POINTS) { 124 for (CeedInt i = 0; i < Q_1D; i++) { 125 atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); 126 } 127 } 128 // Pull from shared to register 129 __syncthreads(); 130 if (data.t_id_x < Q_1D) r_C[comp] += data.slice[data.t_id_x]; 131 } 132 } 133 134 //------------------------------------------------------------------------------ 135 // 2D 136 //------------------------------------------------------------------------------ 137 138 //------------------------------------------------------------------------------ 139 // 2D interpolate to points 140 //------------------------------------------------------------------------------ 141 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 142 inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 143 CeedScalar *__restrict__ r_V) { 144 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 145 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 146 CeedScalar buffer[Q_1D]; 147 CeedScalar chebyshev_x[Q_1D]; 148 149 // Load coefficients 150 __syncthreads(); 151 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 152 __syncthreads(); 153 // Contract x direction 154 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 155 for (CeedInt i = 0; i < Q_1D; i++) { 156 buffer[i] = 0.0; 157 for (CeedInt j = 0; j < Q_1D; j++) { 158 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 159 } 160 } 161 // Contract y direction 162 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 163 for (CeedInt i = 0; i < Q_1D; i++) { 164 r_V[comp] += chebyshev_x[i] * buffer[i]; 165 } 166 } 167 } 168 169 //------------------------------------------------------------------------------ 170 // 2D interpolate transpose 171 //------------------------------------------------------------------------------ 172 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 173 inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 174 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 175 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 176 CeedScalar buffer[Q_1D]; 177 CeedScalar chebyshev_x[Q_1D]; 178 179 // Clear shared memory 180 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 181 __syncthreads(); 182 // Contract y direction 183 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 184 for (CeedInt i = 0; i < Q_1D; i++) { 185 buffer[i] = chebyshev_x[i] * r_U[comp]; 186 } 187 // Contract x direction 188 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 189 if (p < NUM_POINTS) { 190 for (CeedInt i = 0; i < Q_1D; i++) { 191 // Note: shifting to avoid atomic adds 192 const CeedInt ii = (i + data.t_id_x) % Q_1D; 193 194 for (CeedInt j = 0; j < Q_1D; j++) { 195 const CeedInt jj = (j + data.t_id_y) % Q_1D; 196 197 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 198 } 199 } 200 } 201 // Pull from shared to register 202 __syncthreads(); 203 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 204 } 205 } 206 207 //------------------------------------------------------------------------------ 208 // 2D derivatives at points 209 //------------------------------------------------------------------------------ 210 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 211 inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 212 CeedScalar *__restrict__ r_V) { 213 for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; 214 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 215 CeedScalar buffer[Q_1D]; 216 CeedScalar chebyshev_x[Q_1D]; 217 218 // Load coefficients 219 __syncthreads(); 220 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; 221 __syncthreads(); 222 for (CeedInt dim = 0; dim < 2; dim++) { 223 // Contract x direction 224 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 225 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 226 for (CeedInt i = 0; i < Q_1D; i++) { 227 buffer[i] = 0.0; 228 for (CeedInt j = 0; j < Q_1D; j++) { 229 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 230 } 231 } 232 // Contract y direction 233 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 234 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 235 for (CeedInt i = 0; i < Q_1D; i++) { 236 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; 237 } 238 } 239 } 240 } 241 242 //------------------------------------------------------------------------------ 243 // 2D derivatives transpose 244 //------------------------------------------------------------------------------ 245 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 246 inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 247 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 248 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 249 CeedScalar buffer[Q_1D]; 250 CeedScalar chebyshev_x[Q_1D]; 251 252 // Clear shared memory 253 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 254 __syncthreads(); 255 for (CeedInt dim = 0; dim < 2; dim++) { 256 // Contract y direction 257 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 258 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 259 for (CeedInt i = 0; i < Q_1D; i++) { 260 buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; 261 } 262 // Contract x direction 263 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 264 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 265 if (p < NUM_POINTS) { 266 for (CeedInt i = 0; i < Q_1D; i++) { 267 // Note: shifting to avoid atomic adds 268 const CeedInt ii = (i + data.t_id_x) % Q_1D; 269 270 for (CeedInt j = 0; j < Q_1D; j++) { 271 const CeedInt jj = (j + data.t_id_y) % Q_1D; 272 273 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 274 } 275 } 276 } 277 } 278 // Pull from shared to register 279 __syncthreads(); 280 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 281 } 282 } 283 284 //------------------------------------------------------------------------------ 285 // 3D 286 //------------------------------------------------------------------------------ 287 288 //------------------------------------------------------------------------------ 289 // 3D interpolate to points 290 //------------------------------------------------------------------------------ 291 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 292 inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 293 CeedScalar *__restrict__ r_V) { 294 for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; 295 for (CeedInt k = 0; k < Q_1D; k++) { 296 CeedScalar buffer[Q_1D]; 297 CeedScalar chebyshev_x[Q_1D]; 298 299 // Get z contraction value 300 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 301 const CeedScalar z = chebyshev_x[k]; 302 303 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 304 // Load coefficients 305 __syncthreads(); 306 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 307 __syncthreads(); 308 // Contract x direction 309 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 310 for (CeedInt i = 0; i < Q_1D; i++) { 311 buffer[i] = 0.0; 312 for (CeedInt j = 0; j < Q_1D; j++) { 313 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 314 } 315 } 316 // Contract y and z direction 317 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 318 for (CeedInt i = 0; i < Q_1D; i++) { 319 r_V[comp] += chebyshev_x[i] * buffer[i] * z; 320 } 321 } 322 } 323 } 324 325 //------------------------------------------------------------------------------ 326 // 3D interpolate transpose 327 //------------------------------------------------------------------------------ 328 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 329 inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 330 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 331 for (CeedInt k = 0; k < Q_1D; k++) { 332 CeedScalar buffer[Q_1D]; 333 CeedScalar chebyshev_x[Q_1D]; 334 335 // Get z contraction value 336 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 337 const CeedScalar z = chebyshev_x[k]; 338 339 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 340 // Clear shared memory 341 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 342 __syncthreads(); 343 // Contract y and z direction 344 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 345 for (CeedInt i = 0; i < Q_1D; i++) { 346 buffer[i] = chebyshev_x[i] * r_U[comp] * z; 347 } 348 // Contract x direction 349 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 350 if (p < NUM_POINTS) { 351 for (CeedInt i = 0; i < Q_1D; i++) { 352 // Note: shifting to avoid atomic adds 353 const CeedInt ii = (i + data.t_id_x) % Q_1D; 354 355 for (CeedInt j = 0; j < Q_1D; j++) { 356 const CeedInt jj = (j + data.t_id_y) % Q_1D; 357 358 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 359 } 360 } 361 } 362 // Pull from shared to register 363 __syncthreads(); 364 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 365 } 366 } 367 } 368 369 //------------------------------------------------------------------------------ 370 // 3D derivatives at points 371 //------------------------------------------------------------------------------ 372 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 373 inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, 374 CeedScalar *__restrict__ r_V) { 375 for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; 376 for (CeedInt k = 0; k < Q_1D; k++) { 377 CeedScalar buffer[Q_1D]; 378 CeedScalar chebyshev_x[Q_1D]; 379 380 // Get z contraction values 381 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 382 const CeedScalar z = chebyshev_x[k]; 383 384 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 385 const CeedScalar dz = chebyshev_x[k]; 386 387 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 388 // Load coefficients 389 __syncthreads(); 390 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; 391 __syncthreads(); 392 // Gradient directions 393 for (CeedInt dim = 0; dim < 3; dim++) { 394 // Contract x direction 395 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 396 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 397 for (CeedInt i = 0; i < Q_1D; i++) { 398 buffer[i] = 0.0; 399 for (CeedInt j = 0; j < Q_1D; j++) { 400 buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; 401 } 402 } 403 // Contract y and z direction 404 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 405 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 406 const CeedScalar zz = dim == 2 ? dz : z; 407 408 for (CeedInt i = 0; i < Q_1D; i++) { 409 r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz; 410 } 411 } 412 } 413 } 414 } 415 416 //------------------------------------------------------------------------------ 417 // 3D derivatives transpose 418 //------------------------------------------------------------------------------ 419 template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D> 420 inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, 421 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { 422 for (CeedInt k = 0; k < Q_1D; k++) { 423 CeedScalar buffer[Q_1D]; 424 CeedScalar chebyshev_x[Q_1D]; 425 426 // Get z contraction values 427 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x); 428 const CeedScalar z = chebyshev_x[k]; 429 430 ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x); 431 const CeedScalar dz = chebyshev_x[k]; 432 433 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 434 // Clear shared memory 435 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; 436 __syncthreads(); 437 // Gradient directions 438 for (CeedInt dim = 0; dim < 3; dim++) { 439 // Contract y and z direction 440 if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x); 441 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x); 442 const CeedScalar zz = dim == 2 ? dz : z; 443 444 for (CeedInt i = 0; i < Q_1D; i++) { 445 buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz; 446 } 447 // Contract x direction 448 if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); 449 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); 450 if (p < NUM_POINTS) { 451 for (CeedInt i = 0; i < Q_1D; i++) { 452 // Note: shifting to avoid atomic adds 453 const CeedInt ii = (i + data.t_id_x) % Q_1D; 454 455 for (CeedInt j = 0; j < Q_1D; j++) { 456 const CeedInt jj = (j + data.t_id_y) % Q_1D; 457 458 atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); 459 } 460 } 461 } 462 } 463 // Pull from shared to register 464 __syncthreads(); 465 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; 466 } 467 } 468 } 469