// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B into B, with every thread of
// the block striding over the entries.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += num_threads) {
    B[i] = d_B[i];
  }
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
// Read all components of point p of element elem from the L-vector d_u into
// the registers r_u, using the offsets in indices.
// NOTE(review): unlike WritePoint below there is no p < points_in_elem guard
// here — presumably indices is padded so reads for out-of-range p are safe;
// verify against the callers.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[ind + c * COMP_STRIDE];
  }
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
// Accumulate all components of point p of element elem from the registers r_u
// into the L-vector d_u; points at or beyond points_in_elem are skipped.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p < points_in_elem) {
    const CeedInt ind = indices[p + elem * NUM_PTS];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_u[ind + c * COMP_STRIDE] += r_u[c];
    }
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened as comp * P_1D + node) to value
// in the registers of the thread owning that node.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt target_comp = n / P_1D;
  const CeedInt target_node = n % P_1D;

  if (data.t_id_x == target_node) r_v[target_comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Read all components of this thread's node of element elem from the L-vector
// d_u into the registers r_u, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D) {
    const CeedInt ind = indices[data.t_id_x + elem * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[c] = d_u[ind + COMP_STRIDE * c];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Read all components of this thread's node of element elem from the strided
// L-vector d_u into the registers r_u.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D) {
    const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[c] = d_u[ind + c * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Accumulate all components of this thread's node of element elem from the
// registers r_v into the L-vector d_v; atomics are used since nodes may be
// shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D) {
    const CeedInt ind = indices[data.t_id_x + elem * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      atomicAdd(&d_v[ind + COMP_STRIDE * c], r_v[c]);
    }
  }
}

// Accumulate the single E-vector entry n (flattened as comp * P_1D + node)
// from the registers of the owning thread into the L-vector d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp = n / P_1D;
  const CeedInt target_node = n % P_1D;

  if (data.t_id_x == target_node) {
    const CeedInt ind = indices[target_node + elem * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate one column (input node/component index in) of the element matrix
// of element elem into d_v, one output row per thread node.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt in_comp    = in / P_1D;
  const CeedInt in_node    = in % P_1D;
  const CeedInt e_vec_size = P_1D * NUM_COMP;

  if (data.t_id_x < P_1D) {
    const CeedInt out_node = data.t_id_x;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      // Element matrix layout: [elem][in_comp][out comp][out_node][in_node]
      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + c) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate all components of this thread's node of element elem from the
// registers r_v into the strided L-vector d_v. Plain += (no atomics) —
// presumably strided entries are element-exclusive; verify against callers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D) {
    const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_v[ind + c * STRIDES_COMP] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened as comp * P_1D^2 + node_y * P_1D
// + node_x) to value in the registers of the thread owning that node.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) r_v[target_comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Read all components of this thread's node of element elem from the L-vector
// d_u into the registers r_u, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt ind = indices[data.t_id_x + data.t_id_y * P_1D + elem * P_1D * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[c] = d_u[ind + COMP_STRIDE * c];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Read all components of this thread's node of element elem from the strided
// L-vector d_u into the registers r_u.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt ind = (data.t_id_x + data.t_id_y * P_1D) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[c] = d_u[ind + c * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Accumulate all components of this thread's node of element elem from the
// registers r_v into the L-vector d_v; atomics are used since nodes may be
// shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt ind = indices[data.t_id_x + data.t_id_y * P_1D + elem * P_1D * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      atomicAdd(&d_v[ind + COMP_STRIDE * c], r_v[c]);
    }
  }
}

// Accumulate the single E-vector entry n (flattened as comp * P_1D^2 +
// node_y * P_1D + node_x) from the registers of the owning thread into d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt ind = indices[target_node_x + target_node_y * P_1D + elem * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate one column (input node/component index in) of the element matrix
// of element elem into d_v, one output row per thread node.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % elem_size) / P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      // Element matrix layout: [elem][in_comp][out comp][out_node][in_node]
      const CeedInt index = (in_comp * NUM_COMP + c) * elem_size * elem_size + out_node * elem_size + in_node;

      d_v[elem * e_vec_size * e_vec_size + index] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate all components of this thread's node of element elem from the
// registers r_v into the strided L-vector d_v. Plain += (no atomics) —
// presumably strided entries are element-exclusive; verify against callers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt ind = (data.t_id_x + data.t_id_y * P_1D) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_v[ind + c * STRIDES_COMP] += r_v[c];
    }
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened as comp * P_1D^3 + node_z *
// P_1D^2 + node_y * P_1D + node_x) to value in the registers of the owning
// thread; the z index selects the register within the thread's z-column.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    r_v[target_node_z + target_comp * P_1D] = value;
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Read all components of this thread's column of P_1D nodes of element elem
// from the L-vector d_u into the registers r_u, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt c = 0; c < NUM_COMP; c++) {
        r_u[z + c * P_1D] = d_u[ind + COMP_STRIDE * c];
      }
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Read all components of this thread's column of P_1D nodes of element elem
// from the strided L-vector d_u into the registers r_u.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt c = 0; c < NUM_COMP; c++) {
        r_u[z + c * P_1D] = d_u[ind + c * STRIDES_COMP];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Reads all components of this thread's quadrature point on slice q of element
// elem from d_u into the registers r_u, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Reads all components of this thread's quadrature point on slice q of element
// elem from the strided vector d_u into the registers r_u.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Accumulates all components of this thread's column of P_1D nodes of element
// elem from the registers r_v into the L-vector d_v; atomics are used since
// nodes may be shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}

// Accumulates the single E-vector entry n (flattened as comp * P_1D^3 +
// node_z * P_1D^2 + node_y * P_1D + node_x) from the registers of the owning
// thread into the L-vector d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulates one column (input node/component index in) of the element matrix
// of element elem into d_v, one output row per thread node and z level.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;

    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
        // Element matrix layout: [elem][in_comp][out comp][out_node][in_node]
        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulates all components of this thread's column of P_1D nodes of element
// elem from the registers r_v into the strided L-vector d_v. Plain += (no
// atomics) — presumably strided entries are element-exclusive; verify against
// callers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Computes the X, Y, and Z derivatives at this thread's quadrature point on
// slice q with the collocated gradient matrix c_G, staging each component
// through the shared scratch data.slice (row stride T_1D).
// Fix: __syncthreads() must be reached by every thread of the block, so the
// barriers are hoisted out of the t_id < Q_1D guard and only the shared-memory
// and register work is predicated.  The previous version placed the barriers
// inside the guard, which is undefined behavior (deadlock/corruption risk)
// whenever blockDim exceeds Q_1D x Q_1D, e.g. when T_1D > Q_1D.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (register data only, no shared memory needed)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Applies the transpose of GradColloSlice3d: accumulates the X, Y, and Z
// derivative contributions of slice q into r_V, staging the X and Y inputs
// through the shared scratch data.slice (row stride T_1D).
// Fix: as in GradColloSlice3d, the barriers are hoisted out of the divergent
// t_id < Q_1D guard so that all threads of the block reach them.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage X-derivative input, then contract
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Stage Y-derivative input, then contract
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (register data only, no shared memory needed)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}