1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP backend macro and type definitions for JiT source 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Load matrices for basis actions 14 //------------------------------------------------------------------------------ 15 template <int P, int Q> 16 inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 18 } 19 20 //------------------------------------------------------------------------------ 21 // AtPoints 22 //------------------------------------------------------------------------------ 23 24 //------------------------------------------------------------------------------ 25 // L-vector -> single point 26 //------------------------------------------------------------------------------ 27 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 28 inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 29 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 30 const CeedInt ind = indices[p + elem * NUM_PTS]; 31 32 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 33 r_u[comp] = d_u[ind + comp * COMP_STRIDE]; 34 } 35 } 36 37 //------------------------------------------------------------------------------ 38 // Single point -> L-vector 39 //------------------------------------------------------------------------------ 40 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 41 inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 42 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { 43 if (p < points_in_elem) { 44 const CeedInt ind = indices[p + elem * NUM_PTS]; 45 46 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 47 d_u[ind + comp * COMP_STRIDE] += r_u[comp]; 48 } 49 } 50 } 51 52 //------------------------------------------------------------------------------ 53 // 1D 54 //------------------------------------------------------------------------------ 55 56 //------------------------------------------------------------------------------ 57 // Set E-vector value 58 //------------------------------------------------------------------------------ 59 template <int NUM_COMP, int P_1D> 60 inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 61 const CeedInt target_comp = n / P_1D; 62 const CeedInt target_node = n % P_1D; 63 64 if (data.t_id_x == target_node) { 65 r_v[target_comp] = value; 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // L-vector -> E-vector, offsets provided 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 73 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 74 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 75 if (data.t_id_x < P_1D) { 76 const CeedInt node = data.t_id_x; 77 const CeedInt ind = indices[node + elem * P_1D]; 78 79 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 80 } 81 } 82 83 //------------------------------------------------------------------------------ 84 // L-vector -> E-vector, strided 85 //------------------------------------------------------------------------------ 86 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 87 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 88 if (data.t_id_x < P_1D) { 89 const CeedInt node = data.t_id_x; 90 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 91 92 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 93 } 94 } 95 96 //------------------------------------------------------------------------------ 97 // E-vector -> L-vector, offsets provided 98 //------------------------------------------------------------------------------ 99 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 100 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 101 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 102 if (data.t_id_x < P_1D) { 103 const CeedInt node = data.t_id_x; 104 const CeedInt ind = indices[node + elem * P_1D]; 105 106 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 107 } 108 } 109 110 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 111 inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 112 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 113 CeedScalar *__restrict__ d_v) { 114 const CeedInt target_comp = n / P_1D; 115 const CeedInt target_node = n % P_1D; 116 117 if (data.t_id_x == target_node) { 118 const CeedInt ind = indices[target_node + elem * P_1D]; 119 120 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 121 } 122 } 123 124 //------------------------------------------------------------------------------ 125 // E-vector -> L-vector, strided 126 //------------------------------------------------------------------------------ 127 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 128 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 129 CeedScalar *__restrict__ d_v) { 130 if (data.t_id_x < P_1D) { 131 const CeedInt node = data.t_id_x; 132 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 133 134 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 135 } 136 } 137 138 //------------------------------------------------------------------------------ 139 // 2D 140 //------------------------------------------------------------------------------ 141 142 //------------------------------------------------------------------------------ 143 // Set E-vector value 144 //------------------------------------------------------------------------------ 145 template <int NUM_COMP, int P_1D> 146 inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 147 const CeedInt target_comp = n / (P_1D * P_1D); 148 const CeedInt target_node_x = n % P_1D; 149 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 150 151 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 152 r_v[target_comp] = value; 153 } 154 } 155 156 //------------------------------------------------------------------------------ 157 // L-vector -> E-vector, offsets provided 158 //------------------------------------------------------------------------------ 159 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 160 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 161 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 162 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 163 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 164 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 165 166 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 167 } 168 } 169 170 //------------------------------------------------------------------------------ 171 // L-vector -> E-vector, strided 172 //------------------------------------------------------------------------------ 173 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 174 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 175 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 176 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 177 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 178 179 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 180 } 181 } 182 183 //------------------------------------------------------------------------------ 184 // E-vector -> L-vector, offsets provided 185 //------------------------------------------------------------------------------ 186 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 187 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 188 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 189 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 190 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 191 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 192 193 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 194 } 195 } 196 197 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 198 inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 199 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 200 CeedScalar *__restrict__ d_v) { 201 const CeedInt target_comp = n / (P_1D * P_1D); 202 const CeedInt target_node_x = n % P_1D; 203 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 204 205 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 206 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 207 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 208 209 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 210 } 211 } 212 213 //------------------------------------------------------------------------------ 214 // E-vector -> L-vector, strided 215 //------------------------------------------------------------------------------ 216 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 217 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 218 CeedScalar *__restrict__ d_v) { 219 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 220 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 221 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 222 223 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 224 } 225 } 226 227 //------------------------------------------------------------------------------ 228 // 3D 229 //------------------------------------------------------------------------------ 230 231 //------------------------------------------------------------------------------ 232 // Set E-vector value 233 //------------------------------------------------------------------------------ 234 template <int NUM_COMP, int P_1D> 235 inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 236 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 237 const CeedInt target_node_x = n % P_1D; 238 const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 239 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 240 241 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 242 r_v[target_node_z + target_comp * P_1D] = value; 243 } 244 } 245 246 //------------------------------------------------------------------------------ 247 // L-vector -> E-vector, offsets provided 248 //------------------------------------------------------------------------------ 249 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 250 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 251 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 252 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 253 for (CeedInt z = 0; z < P_1D; z++) { 254 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 255 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 256 257 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; 258 } 259 } 260 } 261 262 //------------------------------------------------------------------------------ 263 // L-vector -> E-vector, strided 264 //------------------------------------------------------------------------------ 265 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 266 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 267 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 268 for (CeedInt z = 0; z < P_1D; z++) { 269 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 270 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 271 272 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; 273 } 274 } 275 } 276 277 //------------------------------------------------------------------------------ 278 // E-vector -> Q-vector, offests provided 279 //------------------------------------------------------------------------------ 280 template <int NUM_COMP, int COMP_STRIDE, int Q_1D> 281 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 282 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 283 CeedScalar *__restrict__ r_u) { 284 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 285 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 286 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 287 288 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 289 } 290 } 291 292 //------------------------------------------------------------------------------ 293 // E-vector -> Q-vector, strided 294 //------------------------------------------------------------------------------ 295 template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 296 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 297 CeedScalar *__restrict__ r_u) { 298 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 299 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 300 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 301 302 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 303 } 304 } 305 306 //------------------------------------------------------------------------------ 307 // E-vector -> L-vector, offsets provided 308 //------------------------------------------------------------------------------ 309 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 310 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 311 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 312 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 313 for (CeedInt z = 0; z < P_1D; z++) { 314 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 315 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 316 317 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); 318 } 319 } 320 } 321 322 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 323 inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 324 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 325 CeedScalar *__restrict__ d_v) { 326 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 327 const CeedInt target_node_x = n % P_1D; 328 const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D; 329 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 330 331 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 332 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; 333 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 334 335 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); 336 } 337 } 338 339 //------------------------------------------------------------------------------ 340 // E-vector -> L-vector, strided 341 //------------------------------------------------------------------------------ 342 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 343 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 344 CeedScalar *__restrict__ d_v) { 345 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 346 for (CeedInt z = 0; z < P_1D; z++) { 347 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 348 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 349 350 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; 351 } 352 } 353 } 354 355 //------------------------------------------------------------------------------ 356 // 3D collocated derivatives computation 357 //------------------------------------------------------------------------------ 358 template <int NUM_COMP, int Q_1D, int T_1D> 359 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 360 CeedScalar *__restrict__ r_V) { 361 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 362 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 363 __syncthreads(); 364 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; 365 __syncthreads(); 366 // X derivative 367 r_V[comp + 0 * NUM_COMP] = 0.0; 368 for (CeedInt i = 0; i < Q_1D; i++) { 369 r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 370 } 371 // Y derivative 372 r_V[comp + 1 * NUM_COMP] = 0.0; 373 for (CeedInt i = 0; i < Q_1D; i++) { 374 r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 375 } 376 // Z derivative 377 r_V[comp + 2 * NUM_COMP] = 0.0; 378 for (CeedInt i = 0; i < Q_1D; i++) { 379 r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; 380 } 381 } 382 } 383 } 384 385 //------------------------------------------------------------------------------ 386 // 3D collocated derivatives transpose 387 //------------------------------------------------------------------------------ 388 template <int NUM_COMP, int Q_1D, int T_1D> 389 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 390 CeedScalar *__restrict__ r_V) { 391 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 392 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 393 // X derivative 394 __syncthreads(); 395 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 396 __syncthreads(); 397 for (CeedInt i = 0; i < Q_1D; i++) { 398 r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 399 } 400 // Y derivative 401 __syncthreads(); 402 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 403 __syncthreads(); 404 for (CeedInt i = 0; i < Q_1D; i++) { 405 r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 406 } 407 // Z derivative 408 for (CeedInt i = 0; i < Q_1D; i++) { 409 r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; 410 } 411 } 412 } 413 } 414