1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for CUDA backend macro and type definitions for JiT source 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Load matrices for basis actions 14 //------------------------------------------------------------------------------ 15 template <int P, int Q> 16 inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 18 } 19 20 //------------------------------------------------------------------------------ 21 // AtPoints 22 //------------------------------------------------------------------------------ 23 24 //------------------------------------------------------------------------------ 25 // L-vector -> single point 26 //------------------------------------------------------------------------------ 27 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 28 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 29 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { 30 const CeedInt ind = indices[p + elem * NUM_PTS]; 31 32 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 33 r_u[comp] = d_u[ind + comp * COMP_STRIDE]; 34 } 35 } 36 37 //------------------------------------------------------------------------------ 38 // Single point -> L-vector 39 //------------------------------------------------------------------------------ 40 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS> 41 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, 42 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { 43 if (p < points_in_elem) { 44 const CeedInt ind = indices[p + elem * NUM_PTS]; 45 46 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 47 d_u[ind + comp * COMP_STRIDE] += r_u[comp]; 48 } 49 } 50 } 51 52 //------------------------------------------------------------------------------ 53 // 1D 54 //------------------------------------------------------------------------------ 55 56 //------------------------------------------------------------------------------ 57 // Set E-vector value 58 //------------------------------------------------------------------------------ 59 template <int NUM_COMP, int P_1D> 60 inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 61 const CeedInt target_comp = n / P_1D; 62 const CeedInt target_node = n % P_1D; 63 64 if (data.t_id_x == target_node) { 65 r_v[target_comp] = value; 66 } 67 } 68 69 //------------------------------------------------------------------------------ 70 // L-vector -> E-vector, offsets provided 71 //------------------------------------------------------------------------------ 72 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 73 inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 74 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 75 if (data.t_id_x < P_1D) { 76 const CeedInt node = data.t_id_x; 77 const CeedInt ind = indices[node + elem * P_1D]; 78 79 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 80 } 81 } 82 83 //------------------------------------------------------------------------------ 84 // L-vector -> E-vector, strided 85 //------------------------------------------------------------------------------ 86 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 87 inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, 88 CeedScalar *__restrict__ r_u) { 89 if (data.t_id_x < P_1D) { 90 const CeedInt node = data.t_id_x; 91 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 92 93 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 94 } 95 } 96 97 //------------------------------------------------------------------------------ 98 // E-vector -> L-vector, offsets provided 99 //------------------------------------------------------------------------------ 100 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 101 inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 102 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 103 if (data.t_id_x < P_1D) { 104 const CeedInt node = data.t_id_x; 105 const CeedInt ind = indices[node + elem * P_1D]; 106 107 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 108 } 109 } 110 111 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 112 inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 113 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 114 CeedScalar *__restrict__ d_v) { 115 const CeedInt target_comp = n / P_1D; 116 const CeedInt target_node = n % P_1D; 117 118 if (data.t_id_x == target_node) { 119 const CeedInt ind = indices[target_node + elem * P_1D]; 120 121 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 122 } 123 } 124 125 //------------------------------------------------------------------------------ 126 // E-vector -> L-vector, strided 127 //------------------------------------------------------------------------------ 128 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 129 inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 130 CeedScalar *__restrict__ d_v) { 131 if (data.t_id_x < P_1D) { 132 const CeedInt node = data.t_id_x; 133 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 134 135 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 136 } 137 } 138 139 //------------------------------------------------------------------------------ 140 // 2D 141 //------------------------------------------------------------------------------ 142 143 //------------------------------------------------------------------------------ 144 // Set E-vector value 145 //------------------------------------------------------------------------------ 146 template <int NUM_COMP, int P_1D> 147 inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 148 const CeedInt target_comp = n / (P_1D * P_1D); 149 const CeedInt target_node_x = n % P_1D; 150 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 151 152 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 153 r_v[target_comp] = value; 154 } 155 } 156 157 //------------------------------------------------------------------------------ 158 // L-vector -> E-vector, offsets provided 159 //------------------------------------------------------------------------------ 160 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 161 inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 162 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 163 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 164 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 165 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 166 167 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 168 } 169 } 170 171 //------------------------------------------------------------------------------ 172 // L-vector -> E-vector, strided 173 //------------------------------------------------------------------------------ 174 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 175 inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, 176 CeedScalar *__restrict__ r_u) { 177 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 178 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 179 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 180 181 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 182 } 183 } 184 185 //------------------------------------------------------------------------------ 186 // E-vector -> L-vector, offsets provided 187 //------------------------------------------------------------------------------ 188 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 189 inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 190 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 191 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 192 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 193 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 194 195 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 196 } 197 } 198 199 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 200 inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 201 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 202 CeedScalar *__restrict__ d_v) { 203 const CeedInt target_comp = n / (P_1D * P_1D); 204 const CeedInt target_node_x = n % P_1D; 205 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 206 207 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 208 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 209 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 210 211 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]); 212 } 213 } 214 215 //------------------------------------------------------------------------------ 216 // E-vector -> L-vector, strided 217 //------------------------------------------------------------------------------ 218 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 219 inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 220 CeedScalar *__restrict__ d_v) { 221 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 222 const CeedInt node = data.t_id_x + data.t_id_y * P_1D; 223 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 224 225 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 226 } 227 } 228 229 //------------------------------------------------------------------------------ 230 // 3D 231 //------------------------------------------------------------------------------ 232 233 //------------------------------------------------------------------------------ 234 // Set E-vector value 235 //------------------------------------------------------------------------------ 236 template <int NUM_COMP, int P_1D> 237 inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { 238 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 239 const CeedInt target_node_x = n % P_1D; 240 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 241 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 242 243 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 244 r_v[target_node_z + target_comp * P_1D] = value; 245 } 246 } 247 248 //------------------------------------------------------------------------------ 249 // L-vector -> E-vector, offsets provided 250 //------------------------------------------------------------------------------ 251 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 252 inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 253 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 254 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 255 for (CeedInt z = 0; z < P_1D; z++) { 256 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 257 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 258 259 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp]; 260 } 261 } 262 } 263 264 //------------------------------------------------------------------------------ 265 // L-vector -> E-vector, strided 266 //------------------------------------------------------------------------------ 267 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 268 inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, 269 CeedScalar *__restrict__ r_u) { 270 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 271 for (CeedInt z = 0; z < P_1D; z++) { 272 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 273 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 274 275 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP]; 276 } 277 } 278 } 279 280 //------------------------------------------------------------------------------ 281 // E-vector -> Q-vector, offests provided 282 //------------------------------------------------------------------------------ 283 template <int NUM_COMP, int COMP_STRIDE, int Q_1D> 284 inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 285 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 286 CeedScalar *__restrict__ r_u) { 287 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 288 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 289 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 290 291 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 292 } 293 } 294 295 //------------------------------------------------------------------------------ 296 // E-vector -> Q-vector, strided 297 //------------------------------------------------------------------------------ 298 template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 299 inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 300 CeedScalar *__restrict__ r_u) { 301 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 302 const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; 303 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 304 305 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 306 } 307 } 308 309 //------------------------------------------------------------------------------ 310 // E-vector -> L-vector, offsets provided 311 //------------------------------------------------------------------------------ 312 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 313 inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 314 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 315 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 316 for (CeedInt z = 0; z < P_1D; z++) { 317 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 318 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 319 320 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]); 321 } 322 } 323 } 324 325 template <int NUM_COMP, int COMP_STRIDE, int P_1D> 326 inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, 327 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, 328 CeedScalar *__restrict__ d_v) { 329 const CeedInt target_comp = n / (P_1D * P_1D * P_1D); 330 const CeedInt target_node_x = n % P_1D; 331 const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; 332 const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D); 333 334 if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) { 335 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D; 336 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 337 338 atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]); 339 } 340 } 341 342 //------------------------------------------------------------------------------ 343 // E-vector -> L-vector, strided 344 //------------------------------------------------------------------------------ 345 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 346 inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 347 CeedScalar *__restrict__ d_v) { 348 if (data.t_id_x < P_1D && data.t_id_y < P_1D) { 349 for (CeedInt z = 0; z < P_1D; z++) { 350 const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; 351 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 352 353 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D]; 354 } 355 } 356 } 357 358 //------------------------------------------------------------------------------ 359 // 3D collocated derivatives computation 360 //------------------------------------------------------------------------------ 361 template <int NUM_COMP, int Q_1D, int T_1D> 362 inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 363 CeedScalar *__restrict__ r_V) { 364 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 365 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 366 __syncthreads(); 367 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D]; 368 __syncthreads(); 369 // X derivative 370 r_V[comp + 0 * NUM_COMP] = 0.0; 371 for (CeedInt i = 0; i < Q_1D; i++) { 372 r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 373 } 374 // Y derivative 375 r_V[comp + 1 * NUM_COMP] = 0.0; 376 for (CeedInt i = 0; i < Q_1D; i++) { 377 r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 378 } 379 // Z derivative 380 r_V[comp + 2 * NUM_COMP] = 0.0; 381 for (CeedInt i = 0; i < Q_1D; i++) { 382 r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; 383 } 384 } 385 } 386 } 387 388 //------------------------------------------------------------------------------ 389 // 3D collocated derivatives transpose 390 //------------------------------------------------------------------------------ 391 template <int NUM_COMP, int Q_1D, int T_1D> 392 inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 393 CeedScalar *__restrict__ r_V) { 394 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { 395 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 396 __syncthreads(); 397 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 398 __syncthreads(); 399 // X derivative 400 for (CeedInt i = 0; i < Q_1D; i++) { 401 r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D]; 402 } 403 // Y derivative 404 __syncthreads(); 405 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 406 __syncthreads(); 407 for (CeedInt i = 0; i < Q_1D; i++) { 408 r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D]; 409 } 410 // Z derivative 411 for (CeedInt i = 0; i < Q_1D; i++) { 412 r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP]; 413 } 414 } 415 } 416 } 417