// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// All threads of the block cooperatively copy the P x Q matrix d_B into B,
// each thread striding by the total number of threads in the block.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += num_threads) B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
// Gather all components of point p of element elem from the L-vector d_u into
// the registers r_u.
// NOTE(review): unlike WritePoint there is no p < points_in_elem guard here —
// presumably indices is padded so reads past points_in_elem hit a valid entry;
// confirm against the calling kernel.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
// Accumulate all components of point p of element elem from registers r_u into
// the L-vector d_u; points at or past points_in_elem are skipped.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p >= points_in_elem) return;

  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_u[ind + comp * COMP_STRIDE] += r_u[comp];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened component-major over P_1D nodes)
// to value in the register representation r_v; only the owning thread writes.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x == node) r_v[comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each of the first P_1D threads gathers its node's components from d_u into
// registers, using the element's restriction indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided analogue of ReadLVecStandard1d: the L-vector location is computed
// from compile-time strides instead of a restriction index array.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Each of the first P_1D threads accumulates its node's components into d_v.
// atomicAdd is used because multiple elements may write the same L-vector node.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}

// Accumulate the single E-vector entry n (flattened component-major over P_1D
// nodes) into d_v; only the owning thread performs the atomic update.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x != node) return;

  atomicAdd(&d_v[indices[node + elem * P_1D] + COMP_STRIDE * comp], r_v[comp]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate column `in` of the dense per-element operator block: every output
// node adds its register values into the element's e_vec_size^2 sub-matrix.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt in_comp    = in / P_1D;
  const CeedInt in_node    = in % P_1D;
  const CeedInt e_vec_size = P_1D * NUM_COMP;

  if (data.t_id_x >= P_1D) return;

  const CeedInt out_node = data.t_id_x;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    const CeedInt index = (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node;

    d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Write quadrature values for one (input, output) field pair into the
// assembled QFunction array, one field slab of size Q_1D * num_elem each.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= Q_1D) return;

  const CeedInt ind = data.t_id_x + elem * Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
    d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided analogue of WriteLVecStandard1d. Plain += is used instead of
// atomicAdd — NOTE(review): assumes strided destinations are unique per
// element; confirm with the restriction setup.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened component-major over a
// P_1D x P_1D node grid) to value; only the owning thread writes.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Threads (x, y) with x, y < P_1D each gather one node's components from d_u
// using the element's restriction indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// atomicAdd is used because multiple elements may write the same L-vector node.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}

// Accumulate the single E-vector entry n (flattened component-major over a
// P_1D x P_1D node grid) into d_v; only the owning thread does the atomic add.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate column `in` of the dense per-element operator block: every output
// node adds its register values into the element's e_vec_size^2 sub-matrix.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % elem_size) / P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Write quadrature values for one (input, output) field pair into the
// assembled QFunction array, one field slab of size Q_1D^2 * num_elem each.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Set the single E-vector entry n (flattened component-major over a
// P_1D^3 node grid). Registers hold P_1D z-planes per component, so the value
// lands at r_v[node_z + comp * P_1D] of the owning (x, y) thread.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    r_v[target_node_z + target_comp * P_1D] = value;
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each (x, y) thread gathers the full z-column of its node's components.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one z-slice (quadrature plane q) of the element's values.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// atomicAdd is used because multiple elements may write the same L-vector node.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}

// Accumulate the single E-vector entry n (flattened component-major over a
// P_1D^3 node grid) into d_v; only the owning (x, y) thread does the atomic add.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate column `in` of the dense per-element operator block: every output
// node adds its register values into the element's e_vec_size^2 sub-matrix.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;

    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Write quadrature values for one (input, output) field pair into the
// assembled QFunction array, one field slab of size Q_1D^3 * num_elem each.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt z = 0; z < Q_1D; z++) {
      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;

      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute X/Y/Z derivatives of r_U at quadrature plane q into r_V, staging one
// component slice at a time through data.slice (T_1D x T_1D shared scratch).
// FIX(review): the barriers were previously inside the t_id guard; when the
// block is larger than Q_1D x Q_1D that is a divergent __syncthreads(), which
// is undefined behavior. All threads now reach every barrier and only the
// shared-memory accesses and register updates are guarded.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (collocated in registers; no shared memory needed)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradColloSlice3d: accumulate the X/Y/Z derivative contributions
// of r_U into r_V at quadrature plane q.
// FIX(review): same barrier hoisting as GradColloSlice3d — every thread in the
// block reaches each __syncthreads(), avoiding divergent-barrier UB.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Y derivative
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (registers only; no shared memory needed)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}