// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B from global memory into B
// (typically shared memory), spreading the P * Q entries over every thread in
// the block via a block-stride loop.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += num_threads) {
    B[i] = d_B[i];
  }
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
// Gather the NUM_COMP components of point p in element elem from the L-vector
// d_u into the register array r_u, using the indirection array indices.
// NOTE(review): unlike WritePoint below there is no p < points_in_elem guard —
// presumably indices is padded for out-of-range points; confirm against caller.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedInt l_vec_index = indices[p + elem * NUM_PTS];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    r_u[c] = d_u[l_vec_index + c * COMP_STRIDE];
  }
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
// Accumulate the NUM_COMP components held in r_u for point p of element elem
// into the L-vector d_u; points at or beyond points_in_elem are padding and
// are skipped. The accumulation is a plain +=, so no two concurrent threads
// may target the same L-vector entry.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p < points_in_elem) {
    const CeedInt l_vec_index = indices[p + elem * NUM_PTS];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_u[l_vec_index + c * COMP_STRIDE] += r_u[c];
    }
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Store value into the single E-vector entry n, where n flattens to
// comp * P_1D + node; only the thread owning that node writes its register.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x == node) r_v[comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one 1D element: each of the first P_1D threads in x reads the
// NUM_COMP components of its node through the indirection array indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D) {
    const CeedInt l_vec_index = indices[data.t_id_x + elem * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[l_vec_index + COMP_STRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather one 1D element from a strided (no indirection) L-vector into registers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D) {
    const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Sum one 1D element of register data into the L-vector through the
// indirection array; atomicAdd because neighboring elements share nodes.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D) {
    const CeedInt l_vec_index = indices[data.t_id_x + elem * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[l_vec_index + COMP_STRIDE * c], r_v[c]);
  }
}

// Sum the single E-vector entry n (flattened as comp * P_1D + node) into the
// L-vector; only the thread owning that node performs the atomic update.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x == node) {
    const CeedInt l_vec_index = indices[node + elem * P_1D];

    atomicAdd(&d_v[l_vec_index + COMP_STRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate one column (input basis function `in`) of the per-element dense
// operator: d_v holds, per element, an e_vec_size x e_vec_size block indexed
// [in_comp, comp, out_node, in_node]. Each element owns its own block, so a
// plain += suffices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt in_comp    = in / P_1D;
  const CeedInt in_node    = in % P_1D;
  const CeedInt e_vec_size = P_1D * NUM_COMP;

  if (data.t_id_x < P_1D) {
    const CeedInt out_node = data.t_id_x;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      const CeedInt index = (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node;

      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Store quadrature-point data for QFunction assembly; the output component
// block is selected by input_offset/output_offset and laid out with all
// elements' quadrature points contiguous per component.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v,
                                                      CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D) {
    const CeedInt quad_index = data.t_id_x + elem * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[quad_index + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one 1D element of register data into a strided (no indirection)
// L-vector; strided layouts give each element exclusive entries, so += is safe.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D) {
    const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt c = 0; c < NUM_COMP; c++) d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Store value into the single E-vector entry n, where n flattens to
// comp * P_1D^2 + node_y * P_1D + node_x; only the owning thread writes.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one 2D element: each active (t_id_x, t_id_y) thread reads the
// NUM_COMP components of its node through the indirection array indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node        = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt l_vec_index = indices[node + elem * P_1D * P_1D];

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[l_vec_index + COMP_STRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Read one 2D element from a strided (no indirection) L-vector into registers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Sum one 2D element of register data into the L-vector through the
// indirection array; atomicAdd because neighboring elements share nodes.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}

// Sum the single E-vector entry n (flattened as comp * P_1D^2 + node_y * P_1D
// + node_x) into the L-vector; only the owning thread performs the atomic.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate one column (input basis function `in`) of the per-element dense
// operator block, indexed [in_comp, comp, out_node, in_node]; each element
// owns its own block, so a plain += suffices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % elem_size) / P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Store quadrature-point data for QFunction assembly; the output component
// block is selected by input_offset/output_offset, with all elements'
// quadrature points contiguous per component.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v,
                                                      CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one 2D element of register data into a strided (no indirection)
// L-vector; strided layouts give each element exclusive entries, so += is safe.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
// Store value into the single E-vector entry n (flattened over comp, z, y, x);
// each thread holds the z-column of its (x, y) node, laid out r_v[z + comp * P_1D].
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    r_v[target_node_z + target_comp * P_1D] = value;
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather one 3D element: each active (t_id_x, t_id_y) thread loops over its
// z-column and reads through the indirection array indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Read one 3D element from a strided (no indirection) L-vector into registers.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read the single z-slice q of one 3D element through the indirection array.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read the single z-slice q of one 3D element from a strided layout.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Sum one 3D element of register data into the L-vector through the
// indirection array; atomicAdd because neighboring elements share nodes.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}

// Sum the single E-vector entry n (flattened over comp, z, y, x) into the
// L-vector; only the thread owning node (x, y) performs the atomic.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
// Accumulate one column (input basis function `in`) of the per-element dense
// operator block, indexed [in_comp, comp, out_node, in_node]; each element
// owns its own block, so a plain += suffices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;

    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
// Store quadrature-point data for QFunction assembly; the output component
// block is selected by input_offset/output_offset, with all elements'
// quadrature points contiguous per component.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v,
                                                      CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt z = 0; z < Q_1D; z++) {
      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;

      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate one 3D element of register data into a strided (no indirection)
// L-vector; strided layouts give each element exclusive entries, so += is safe.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute X/Y/Z derivatives at collocated quadrature slice q using the 1D
// collocated gradient matrix c_G. X/Y contractions exchange the slice through
// shared memory; the Z contraction uses the thread's own z-column registers.
// FIX: __syncthreads() must be reached by ALL threads in the block, so the
// barriers are hoisted outside the (t_id < Q_1D) guard — the original placed
// them inside, which is undefined behavior whenever T_1D > Q_1D leaves some
// threads outside the guard. Only the shared-memory accesses are predicated.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (registers only; no shared memory involved)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradColloSlice3d: accumulate the three derivative inputs back
// onto quadrature slice q. Same barrier fix as above: __syncthreads() hoisted
// outside the divergent (t_id < Q_1D) guard so every thread reaches it.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Y derivative
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (registers only; no shared memory involved)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}