1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP backend macro and type definitions for JiT source 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Load matrices for basis actions 14 //------------------------------------------------------------------------------ 15 template <int P, int Q> 16 inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 18 } 19 20 //------------------------------------------------------------------------------ 21 // AtPoints 22 //------------------------------------------------------------------------------ 23 24 //------------------------------------------------------------------------------ 25 // L-vector -> single point 26 //------------------------------------------------------------------------------ 27 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 28 inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 29 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 30 const CeedInt ind = indices[p + elem * NUM_PTS]; 31 32 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 33 r_u[comp] = d_u[ind + comp * COMP_STRIDE]; 34 } 35 } 36 37 //------------------------------------------------------------------------------ 38 // Single point -> L-vector 39 //------------------------------------------------------------------------------ 40 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 41 inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 42 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { 43 if (p < points_in_elem) { 44 const CeedInt ind = indices[p + elem * NUM_PTS]; 45 46 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 47 d_u[ind + comp * COMP_STRIDE] += r_u[comp]; 48 } 49 } 50 } 51 52 //------------------------------------------------------------------------------ 53 // 1D 54 //------------------------------------------------------------------------------ 55 56 //------------------------------------------------------------------------------ 57 // Set E-vector value 58 //------------------------------------------------------------------------------ 59 template <int NUM_COMP, int P_1D> 60 inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 61 const CeedInt target_comp = n / P_1D; 62 const CeedInt target_node = n % P_1D; 63 64 if (data.t_id_x == target_node) { 65 r_v[target_comp] = value; 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // L-vector -> E-vector, offsets provided 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 73 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 74 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 75 if (data.t_id_x < P_1D) { 76 const CeedInt node = data.t_id_x; 77 const CeedInt ind = indices[node + elem * P_1D]; 78 79 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 80 } 81 } 82 83 //------------------------------------------------------------------------------ 84 // L-vector -> E-vector, strided 85 //------------------------------------------------------------------------------ 86 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 87 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 88 if (data.t_id_x < P_1D) { 89 const CeedInt node = data.t_id_x; 90 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 91 92 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 93 } 94 } 95 96 //------------------------------------------------------------------------------ 97 // E-vector -> L-vector, offsets provided 98 //------------------------------------------------------------------------------ 99 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 100 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 101 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 102 if (data.t_id_x < P_1D) { 103 const CeedInt node = data.t_id_x; 104 const CeedInt ind = indices[node + elem * P_1D]; 105 106 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 107 } 108 } 109 110 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 111 inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 112 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 113 CeedScalar *__restrict__ d_v) { 114 const CeedInt target_comp = n / P_1D; 115 const CeedInt target_node = n % P_1D; 116 117 if (data.t_id_x == target_node) { 118 const CeedInt ind = indices[target_node + elem * P_1D]; 119 120 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 121 } 122 } 123 124 //------------------------------------------------------------------------------ 125 // E-vector -> L-vector, full assembly 126 //------------------------------------------------------------------------------ 127 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 128 inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 129 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 130 const CeedInt in_comp = in / P_1D; 131 const CeedInt in_node = in % P_1D; 132 const CeedInt e_vec_size = P_1D * NUM_COMP; 133 134 if (data.t_id_x < P_1D) { 135 const CeedInt out_node = data.t_id_x; 136 137 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 138 d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp]; 139 } 140 } 141 } 142 143 //------------------------------------------------------------------------------ 144 // E-vector -> L-vector, strided 145 //------------------------------------------------------------------------------ 146 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 147 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 148 CeedScalar *__restrict__ d_v) { 149 if (data.t_id_x < P_1D) { 150 const CeedInt node = data.t_id_x; 151 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 152 153 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 154 } 155 } 156 157 //------------------------------------------------------------------------------ 158 // 2D 159 //------------------------------------------------------------------------------ 160 161 //------------------------------------------------------------------------------ 162 // Set E-vector value 163 //------------------------------------------------------------------------------ 164 template <int NUM_COMP, int P_1D> 165 inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 166 const CeedInt target_comp = n / (P_1D * P_1D); 167 const CeedInt target_node_x = n % P_1D; 168 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 169 170 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 171 r_v[target_comp] = value; 172 } 173 } 174 175 //------------------------------------------------------------------------------ 176 // L-vector -> E-vector, offsets provided 177 //------------------------------------------------------------------------------ 178 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 179 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 180 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 181 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 182 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 183 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 184 185 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 186 } 187 } 188 189 //------------------------------------------------------------------------------ 190 // L-vector -> E-vector, strided 191 //------------------------------------------------------------------------------ 192 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 193 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 194 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 195 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 196 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 197 198 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 199 } 200 } 201 202 //------------------------------------------------------------------------------ 203 // E-vector -> L-vector, offsets provided 204 //------------------------------------------------------------------------------ 205 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 206 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 207 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 208 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 209 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 210 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 211 212 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 213 } 214 } 215 216 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 217 inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 218 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 219 CeedScalar *__restrict__ d_v) { 220 const CeedInt target_comp = n / (P_1D * P_1D); 221 const CeedInt target_node_x = n % P_1D; 222 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 223 224 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 225 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 226 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 227 228 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 229 } 230 } 231 232 //------------------------------------------------------------------------------ 233 // E-vector -> L-vector, full assembly 234 //------------------------------------------------------------------------------ 235 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 236 inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 237 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 238 const CeedInt elem_size = P_1D * P_1D; 239 const CeedInt in_comp = in / elem_size; 240 const CeedInt in_node_x = in % P_1D; 241 const CeedInt in_node_y = (in % elem_size) / P_1D; 242 const CeedInt e_vec_size = elem_size * NUM_COMP; 243 244 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 245 const CeedInt in_node = in_node_x + in_node_y * P_1D; 246 const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D; 247 248 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 249 const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; 250 251 d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp]; 252 } 253 } 254 } 255 256 //------------------------------------------------------------------------------ 257 // E-vector -> L-vector, strided 258 //------------------------------------------------------------------------------ 259 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 260 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 261 CeedScalar *__restrict__ d_v) { 262 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 263 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 264 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 265 266 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 267 } 268 } 269 270 //------------------------------------------------------------------------------ 271 // 3D 272 //------------------------------------------------------------------------------ 273 274 //------------------------------------------------------------------------------ 275 // Set E-vector value 276 //------------------------------------------------------------------------------ 277 template <int NUM_COMP, int P_1D> 278 inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 279 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 280 const CeedInt target_node_x = n % P_1D; 281 const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 282 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 283 284 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 285 r_v[target_node_z + target_comp * P_1D] = value; 286 } 287 } 288 289 //------------------------------------------------------------------------------ 290 // L-vector -> E-vector, offsets provided 291 //------------------------------------------------------------------------------ 292 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 293 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 294 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 295 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 296 for (CeedInt z = 0; z < P_1D; z++) { 297 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 298 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 299 300 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; 301 } 302 } 303 } 304 305 //------------------------------------------------------------------------------ 306 // L-vector -> E-vector, strided 307 //------------------------------------------------------------------------------ 308 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 309 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 310 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 311 for (CeedInt z = 0; z < P_1D; z++) { 312 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 313 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 314 315 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; 316 } 317 } 318 } 319 320 //------------------------------------------------------------------------------ 321 // E-vector -> Q-vector, offests provided 322 //------------------------------------------------------------------------------ 323 template <int NUM_COMP, int COMP_STRIDE, int Q_1D> 324 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 325 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 326 CeedScalar *__restrict__ r_u) { 327 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 328 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 329 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 330 331 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 332 } 333 } 334 335 //------------------------------------------------------------------------------ 336 // E-vector -> Q-vector, strided 337 //------------------------------------------------------------------------------ 338 template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 339 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 340 CeedScalar *__restrict__ r_u) { 341 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 342 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 343 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 344 345 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 346 } 347 } 348 349 //------------------------------------------------------------------------------ 350 // E-vector -> L-vector, offsets provided 351 //------------------------------------------------------------------------------ 352 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 353 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 354 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 355 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 356 for (CeedInt z = 0; z < P_1D; z++) { 357 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 358 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 359 360 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); 361 } 362 } 363 } 364 365 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 366 inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 367 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 368 CeedScalar *__restrict__ d_v) { 369 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 370 const CeedInt target_node_x = n % P_1D; 371 const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 372 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 373 374 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 375 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; 376 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 377 378 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); 379 } 380 } 381 382 //------------------------------------------------------------------------------ 383 // E-vector -> L-vector, full assembly 384 //------------------------------------------------------------------------------ 385 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 386 inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, 387 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 388 const CeedInt elem_size = P_1D * P_1D * P_1D; 389 const CeedInt in_comp = in / elem_size; 390 const CeedInt in_node_x = in % P_1D; 391 const CeedInt in_node_y = (in % (P_1D * P_1D)) / P_1D; 392 const CeedInt in_node_z = (in % elem_size) / (P_1D * P_1D); 393 const CeedInt e_vec_size = elem_size * NUM_COMP; 394 395 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 396 const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D; 397 for (CeedInt z = 0; z < P_1D; z++) { 398 const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 399 400 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 401 const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node; 402 403 d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D]; 404 } 405 } 406 } 407 } 408 409 //------------------------------------------------------------------------------ 410 // E-vector -> L-vector, strided 411 //------------------------------------------------------------------------------ 412 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 413 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 414 CeedScalar *__restrict__ d_v) { 415 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 416 for (CeedInt z = 0; z < P_1D; z++) { 417 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 418 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 419 420 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; 421 } 422 } 423 } 424 425 //------------------------------------------------------------------------------ 426 // 3D collocated derivatives computation 427 //------------------------------------------------------------------------------ 428 template <int NUM_COMP, int Q_1D, int T_1D> 429 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 430 CeedScalar *__restrict__ r_V) { 431 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 432 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 433 __syncthreads(); 434 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; 435 __syncthreads(); 436 // X derivative 437 r_V[comp + 0 * NUM_COMP] = 0.0; 438 for (CeedInt i = 0; i < Q_1D; i++) { 439 r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 440 } 441 // Y derivative 442 r_V[comp + 1 * NUM_COMP] = 0.0; 443 for (CeedInt i = 0; i < Q_1D; i++) { 444 r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 445 } 446 // Z derivative 447 r_V[comp + 2 * NUM_COMP] = 0.0; 448 for (CeedInt i = 0; i < Q_1D; i++) { 449 r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; 450 } 451 } 452 } 453 } 454 455 //------------------------------------------------------------------------------ 456 // 3D collocated derivatives transpose 457 //------------------------------------------------------------------------------ 458 template <int NUM_COMP, int Q_1D, int T_1D> 459 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 460 CeedScalar *__restrict__ r_V) { 461 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 462 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 463 // X derivative 464 __syncthreads(); 465 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 466 __syncthreads(); 467 for (CeedInt i = 0; i < Q_1D; i++) { 468 r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 469 } 470 // Y derivative 471 __syncthreads(); 472 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 473 __syncthreads(); 474 for (CeedInt i = 0; i < Q_1D; i++) { 475 r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 476 } 477 // Z derivative 478 for (CeedInt i = 0; i < Q_1D; i++) { 479 r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; 480 } 481 } 482 } 483 } 484