1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-ref.h" 20 21 //------------------------------------------------------------------------------ 22 // Compile restriction kernels 23 //------------------------------------------------------------------------------ 24 static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) { 25 Ceed ceed; 26 bool is_deterministic; 27 char *restriction_kernel_source; 28 const char *restriction_kernel_path; 29 CeedInt num_elem, num_comp, elem_size, comp_stride; 30 CeedRestrictionType rstr_type; 31 CeedElemRestriction_Cuda *impl; 32 33 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 34 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 35 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 36 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 37 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 38 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 39 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 40 is_deterministic = impl->d_l_vec_indices != NULL; 41 42 // Compile CUDA kernels 43 switch (rstr_type) { 44 case CEED_RESTRICTION_STRIDED: { 45 bool has_backend_strides; 46 CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; 47 48 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 49 if (!has_backend_strides) { 50 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 51 } 52 53 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path)); 54 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 55 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 56 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 57 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 58 "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", 59 strides[2])); 60 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); 61 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); 62 } break; 63 case CEED_RESTRICTION_STANDARD: { 64 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path)); 65 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 66 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 67 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 68 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 69 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 70 "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 71 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); 72 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); 73 } break; 74 case CEED_RESTRICTION_ORIENTED: { 75 const char *offset_kernel_path; 76 char **file_paths = NULL; 77 CeedInt num_file_paths = 0; 78 79 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path)); 80 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 81 CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 82 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); 83 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 84 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 85 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 86 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 87 "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 88 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); 89 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); 90 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); 91 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); 92 // Cleanup 93 CeedCallBackend(CeedFree(&offset_kernel_path)); 94 for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); 95 CeedCall(CeedFree(&file_paths)); 96 } break; 97 case CEED_RESTRICTION_CURL_ORIENTED: { 98 const char *offset_kernel_path; 99 char **file_paths = NULL; 100 CeedInt num_file_paths = 0; 101 102 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path)); 103 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 104 CeedCallBackend(CeedLoadSourceAndInitializeBuffer(ceed, restriction_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 105 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); 106 CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &num_file_paths, &file_paths, &restriction_kernel_source)); 107 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 108 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 109 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, 110 "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); 111 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); 112 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); 113 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); 114 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); 115 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); 116 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); 117 // Cleanup 118 CeedCallBackend(CeedFree(&offset_kernel_path)); 119 for (CeedInt i = 0; i < num_file_paths; i++) CeedCall(CeedFree(&file_paths[i])); 120 CeedCall(CeedFree(&file_paths)); 121 } break; 122 case CEED_RESTRICTION_POINTS: { 123 // LCOV_EXCL_START 124 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); 125 // LCOV_EXCL_STOP 126 } break; 127 } 128 CeedCallBackend(CeedFree(&restriction_kernel_path)); 129 CeedCallBackend(CeedFree(&restriction_kernel_source)); 130 return CEED_ERROR_SUCCESS; 131 } 132 133 //------------------------------------------------------------------------------ 134 // Core apply restriction code 135 //------------------------------------------------------------------------------ 136 static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, 137 CeedVector u, CeedVector v, CeedRequest *request) { 138 Ceed ceed; 139 CeedRestrictionType rstr_type; 140 const CeedScalar *d_u; 141 CeedScalar *d_v; 142 CeedElemRestriction_Cuda *impl; 143 144 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 145 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 146 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 147 148 // Assemble kernel if needed 149 if (!impl->module) { 150 CeedCallBackend(CeedElemRestrictionSetupCompile_Cuda(rstr)); 151 } 152 153 // Get vectors 154 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 155 if (t_mode == CEED_TRANSPOSE) { 156 // Sum into for transpose mode, e-vec to l-vec 157 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 158 } else { 159 // Overwrite for notranspose mode, l-vec to e-vec 160 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 161 } 162 163 // Restrict 164 if (t_mode == CEED_NOTRANSPOSE) { 165 // L-vector -> E-vector 166 CeedInt elem_size; 167 168 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 169 const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 170 const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); 171 172 switch (rstr_type) { 173 case CEED_RESTRICTION_STRIDED: { 174 void *args[] = {&d_u, &d_v}; 175 176 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 177 } break; 178 case CEED_RESTRICTION_STANDARD: { 179 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 180 181 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 182 } break; 183 case CEED_RESTRICTION_ORIENTED: { 184 if (use_signs) { 185 void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v}; 186 187 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 188 } else { 189 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 190 191 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); 192 } 193 } break; 194 case CEED_RESTRICTION_CURL_ORIENTED: { 195 if (use_signs && use_orients) { 196 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 197 198 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); 199 } else if (use_orients) { 200 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 201 202 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); 203 } else { 204 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 205 206 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); 207 } 208 } break; 209 case CEED_RESTRICTION_POINTS: { 210 // LCOV_EXCL_START 211 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); 212 // LCOV_EXCL_STOP 213 } break; 214 } 215 } else { 216 // E-vector -> L-vector 217 const bool is_deterministic = impl->d_l_vec_indices != NULL; 218 const CeedInt block_size = 32; 219 const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); 220 221 switch (rstr_type) { 222 case CEED_RESTRICTION_STRIDED: { 223 void *args[] = {&d_u, &d_v}; 224 225 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 226 } break; 227 case CEED_RESTRICTION_STANDARD: { 228 if (!is_deterministic) { 229 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 230 231 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 232 } else { 233 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 234 235 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 236 } 237 } break; 238 case CEED_RESTRICTION_ORIENTED: { 239 if (use_signs) { 240 if (!is_deterministic) { 241 void *args[] = {&impl->d_offsets, &impl->d_orients, &d_u, &d_v}; 242 243 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 244 } else { 245 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; 246 247 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 248 } 249 } else { 250 if (!is_deterministic) { 251 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 252 253 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 254 } else { 255 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 256 257 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 258 } 259 } 260 } break; 261 case CEED_RESTRICTION_CURL_ORIENTED: { 262 if (use_signs && use_orients) { 263 if (!is_deterministic) { 264 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 265 266 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 267 } else { 268 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; 269 270 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); 271 } 272 } else if (use_orients) { 273 if (!is_deterministic) { 274 void *args[] = {&impl->d_offsets, &impl->d_curl_orients, &d_u, &d_v}; 275 276 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 277 } else { 278 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; 279 280 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); 281 } 282 } else { 283 if (!is_deterministic) { 284 void *args[] = {&impl->d_offsets, &d_u, &d_v}; 285 286 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); 287 } else { 288 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 289 290 CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); 291 } 292 } 293 } break; 294 case CEED_RESTRICTION_POINTS: { 295 // LCOV_EXCL_START 296 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); 297 // LCOV_EXCL_STOP 298 } break; 299 } 300 } 301 302 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 303 304 // Restore arrays 305 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 306 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 307 return CEED_ERROR_SUCCESS; 308 } 309 310 //------------------------------------------------------------------------------ 311 // Apply restriction 312 //------------------------------------------------------------------------------ 313 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 314 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); 315 } 316 317 //------------------------------------------------------------------------------ 318 // Apply unsigned restriction 319 //------------------------------------------------------------------------------ 320 static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 321 CeedRequest *request) { 322 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); 323 } 324 325 //------------------------------------------------------------------------------ 326 // Apply unoriented restriction 327 //------------------------------------------------------------------------------ 328 static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 329 CeedRequest *request) { 330 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); 331 } 332 333 //------------------------------------------------------------------------------ 334 // Get offsets 335 //------------------------------------------------------------------------------ 336 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 337 CeedElemRestriction_Cuda *impl; 338 339 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 340 switch (mem_type) { 341 case CEED_MEM_HOST: 342 *offsets = impl->h_offsets; 343 break; 344 case CEED_MEM_DEVICE: 345 *offsets = impl->d_offsets; 346 break; 347 } 348 return CEED_ERROR_SUCCESS; 349 } 350 351 //------------------------------------------------------------------------------ 352 // Get orientations 353 //------------------------------------------------------------------------------ 354 static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { 355 CeedElemRestriction_Cuda *impl; 356 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 357 358 switch (mem_type) { 359 case CEED_MEM_HOST: 360 *orients = impl->h_orients; 361 break; 362 case CEED_MEM_DEVICE: 363 *orients = impl->d_orients; 364 break; 365 } 366 return CEED_ERROR_SUCCESS; 367 } 368 369 //------------------------------------------------------------------------------ 370 // Get curl-conforming orientations 371 //------------------------------------------------------------------------------ 372 static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { 373 CeedElemRestriction_Cuda *impl; 374 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 375 376 switch (mem_type) { 377 case CEED_MEM_HOST: 378 *curl_orients = impl->h_curl_orients; 379 break; 380 case CEED_MEM_DEVICE: 381 *curl_orients = impl->d_curl_orients; 382 break; 383 } 384 return CEED_ERROR_SUCCESS; 385 } 386 387 //------------------------------------------------------------------------------ 388 // Destroy restriction 389 //------------------------------------------------------------------------------ 390 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { 391 Ceed ceed; 392 CeedElemRestriction_Cuda *impl; 393 394 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 395 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 396 if (impl->module) { 397 CeedCallCuda(ceed, cuModuleUnload(impl->module)); 398 } 399 CeedCallBackend(CeedFree(&impl->h_offsets_owned)); 400 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_offsets_owned)); 401 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_offsets)); 402 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_t_indices)); 403 CeedCallCuda(ceed, cudaFree((CeedInt *)impl->d_l_vec_indices)); 404 CeedCallBackend(CeedFree(&impl->h_orients_owned)); 405 CeedCallCuda(ceed, cudaFree((bool *)impl->d_orients_owned)); 406 CeedCallBackend(CeedFree(&impl->h_curl_orients_owned)); 407 CeedCallCuda(ceed, cudaFree((CeedInt8 *)impl->d_curl_orients_owned)); 408 CeedCallBackend(CeedFree(&impl)); 409 return CEED_ERROR_SUCCESS; 410 } 411 412 //------------------------------------------------------------------------------ 413 // Create transpose offsets and indices 414 //------------------------------------------------------------------------------ 415 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { 416 Ceed ceed; 417 bool *is_node; 418 CeedSize l_size; 419 CeedInt num_elem, elem_size, num_comp, num_nodes = 0; 420 CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 421 CeedElemRestriction_Cuda *impl; 422 423 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 424 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 425 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 426 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 427 CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 428 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 429 const CeedInt size_indices = num_elem * elem_size; 430 431 // Count num_nodes 432 CeedCallBackend(CeedCalloc(l_size, &is_node)); 433 434 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 435 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 436 impl->num_nodes = num_nodes; 437 438 // L-vector offsets array 439 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 440 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 441 for (CeedInt i = 0, j = 0; i < l_size; i++) { 442 if (is_node[i]) { 443 l_vec_indices[j] = i; 444 ind_to_offset[i] = j++; 445 } 446 } 447 CeedCallBackend(CeedFree(&is_node)); 448 449 // Compute transpose offsets and indices 450 const CeedInt size_offsets = num_nodes + 1; 451 452 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 453 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 454 // Count node multiplicity 455 for (CeedInt e = 0; e < num_elem; ++e) { 456 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 457 } 458 // Convert to running sum 459 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 460 // List all E-vec indices associated with L-vec node 461 for (CeedInt e = 0; e < num_elem; ++e) { 462 for (CeedInt i = 0; i < elem_size; ++i) { 463 const CeedInt lid = elem_size * e + i; 464 const CeedInt gid = indices[lid]; 465 466 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 467 } 468 } 469 // Reset running sum 470 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 471 t_offsets[0] = 0; 472 473 // Copy data to device 474 // -- L-vector indices 475 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 476 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); 477 // -- Transpose offsets 478 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 479 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); 480 // -- Transpose indices 481 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 482 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); 483 484 // Cleanup 485 CeedCallBackend(CeedFree(&ind_to_offset)); 486 CeedCallBackend(CeedFree(&l_vec_indices)); 487 CeedCallBackend(CeedFree(&t_offsets)); 488 CeedCallBackend(CeedFree(&t_indices)); 489 return CEED_ERROR_SUCCESS; 490 } 491 492 //------------------------------------------------------------------------------ 493 // Create restriction 494 //------------------------------------------------------------------------------ 495 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients, 496 const CeedInt8 *curl_orients, CeedElemRestriction rstr) { 497 Ceed ceed, ceed_parent; 498 bool is_deterministic; 499 CeedInt num_elem, elem_size; 500 CeedRestrictionType rstr_type; 501 CeedElemRestriction_Cuda *impl; 502 503 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 504 CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); 505 CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); 506 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 507 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 508 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 509 const CeedInt size = num_elem * elem_size; 510 511 CeedCallBackend(CeedCalloc(1, &impl)); 512 impl->num_nodes = size; 513 CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); 514 515 // Set layouts 516 { 517 bool has_backend_strides; 518 CeedInt layout[3] = {1, size, elem_size}; 519 520 CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); 521 if (rstr_type == CEED_RESTRICTION_STRIDED) { 522 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 523 if (has_backend_strides) { 524 CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, layout)); 525 } 526 } 527 } 528 529 // Set up device offset/orientation arrays 530 if (rstr_type != CEED_RESTRICTION_STRIDED) { 531 switch (mem_type) { 532 case CEED_MEM_HOST: { 533 CeedCallBackend(CeedSetHostCeedIntArray(offsets, copy_mode, size, &impl->h_offsets_owned, &impl->h_offsets_borrowed, &impl->h_offsets)); 534 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_offsets_owned, size * sizeof(CeedInt))); 535 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->d_offsets_owned, impl->h_offsets, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); 536 impl->d_offsets = (CeedInt *)impl->d_offsets_owned; 537 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets)); 538 } break; 539 case CEED_MEM_DEVICE: { 540 CeedCallBackend(CeedSetDeviceCeedIntArray_Cuda(ceed, offsets, copy_mode, size, &impl->d_offsets_owned, &impl->d_offsets_borrowed, 541 (const CeedInt **)&impl->d_offsets)); 542 CeedCallBackend(CeedMalloc(size, &impl->h_offsets_owned)); 543 CeedCallCuda(ceed, cudaMemcpy((CeedInt *)impl->h_offsets_owned, impl->d_offsets, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); 544 impl->h_offsets = impl->h_offsets_owned; 545 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, offsets)); 546 } break; 547 } 548 549 // Orientation data 550 if (rstr_type == CEED_RESTRICTION_ORIENTED) { 551 switch (mem_type) { 552 case CEED_MEM_HOST: { 553 CeedCallBackend(CeedSetHostBoolArray(orients, copy_mode, size, &impl->h_orients_owned, &impl->h_orients_borrowed, &impl->h_orients)); 554 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients_owned, size * sizeof(bool))); 555 CeedCallCuda(ceed, cudaMemcpy((bool *)impl->d_orients_owned, impl->h_orients, size * sizeof(bool), cudaMemcpyHostToDevice)); 556 impl->d_orients = impl->d_orients_owned; 557 } break; 558 case CEED_MEM_DEVICE: { 559 CeedCallBackend(CeedSetDeviceBoolArray_Cuda(ceed, orients, copy_mode, size, &impl->d_orients_owned, &impl->d_orients_borrowed, 560 (const bool **)&impl->d_orients)); 561 CeedCallBackend(CeedMalloc(size, &impl->h_orients_owned)); 562 CeedCallCuda(ceed, cudaMemcpy((bool *)impl->h_orients_owned, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); 563 impl->h_orients = impl->h_orients_owned; 564 } break; 565 } 566 } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { 567 switch (mem_type) { 568 case CEED_MEM_HOST: { 569 CeedCallBackend(CeedSetHostCeedInt8Array(curl_orients, copy_mode, 3 * size, &impl->h_curl_orients_owned, &impl->h_curl_orients_borrowed, 570 &impl->h_curl_orients)); 571 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients_owned, 3 * size * sizeof(CeedInt8))); 572 CeedCallCuda(ceed, 573 cudaMemcpy((CeedInt8 *)impl->d_curl_orients_owned, impl->h_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); 574 impl->d_curl_orients = impl->d_curl_orients_owned; 575 } break; 576 case CEED_MEM_DEVICE: { 577 CeedCallBackend(CeedSetDeviceCeedInt8Array_Cuda(ceed, curl_orients, copy_mode, 3 * size, &impl->d_curl_orients_owned, 578 &impl->d_curl_orients_borrowed, (const CeedInt8 **)&impl->d_curl_orients)); 579 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_owned)); 580 CeedCallCuda(ceed, 581 cudaMemcpy((CeedInt8 *)impl->h_curl_orients_owned, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); 582 impl->h_curl_orients = impl->h_curl_orients_owned; 583 } break; 584 } 585 } 586 } 587 588 // Register backend functions 589 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); 590 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); 591 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); 592 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); 593 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); 594 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); 595 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); 596 return CEED_ERROR_SUCCESS; 597 } 598 599 //------------------------------------------------------------------------------ 600