1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-ref.h" 20 21 //------------------------------------------------------------------------------ 22 // Core apply restriction code 23 //------------------------------------------------------------------------------ 24 static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, 25 CeedVector u, CeedVector v, CeedRequest *request) { 26 Ceed ceed; 27 CeedInt num_elem, elem_size; 28 CeedRestrictionType rstr_type; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedElemRestriction_Cuda *impl; 32 CUfunction kernel; 33 34 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 35 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 36 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 37 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 38 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 39 const CeedInt num_nodes = impl->num_nodes; 40 41 // Get vectors 42 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 43 if (t_mode == CEED_TRANSPOSE) { 44 // Sum into for transpose mode, e-vec to l-vec 45 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 46 } else { 47 // Overwrite for notranspose mode, l-vec to e-vec 48 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 49 } 50 51 // Restrict 52 if (t_mode == CEED_NOTRANSPOSE) { 53 // L-vector -> E-vector 54 const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 55 const CeedInt grid = CeedDivUpInt(num_nodes, block_size); 56 57 switch (rstr_type) { 58 case CEED_RESTRICTION_STRIDED: { 59 kernel = impl->StridedNoTranspose; 60 void *args[] = {&num_elem, &d_u, &d_v}; 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 63 } break; 64 case CEED_RESTRICTION_STANDARD: { 65 kernel = impl->OffsetNoTranspose; 66 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 69 } break; 70 case CEED_RESTRICTION_ORIENTED: { 71 if (use_signs) { 72 kernel = impl->OrientedNoTranspose; 73 void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; 74 75 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 76 } else { 77 kernel = impl->OffsetNoTranspose; 78 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 79 80 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 81 } 82 } break; 83 case CEED_RESTRICTION_CURL_ORIENTED: { 84 if (use_signs && use_orients) { 85 kernel = impl->CurlOrientedNoTranspose; 86 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 87 88 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 89 } else if (use_orients) { 90 kernel = impl->CurlOrientedUnsignedNoTranspose; 91 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 92 93 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 94 } else { 95 kernel = impl->OffsetNoTranspose; 96 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 97 98 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 99 } 100 } break; 101 case CEED_RESTRICTION_POINTS: { 102 // LCOV_EXCL_START 103 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); 104 // LCOV_EXCL_STOP 105 } break; 106 } 107 } else { 108 // E-vector -> L-vector 109 const CeedInt block_size = 32; 110 const CeedInt grid = CeedDivUpInt(num_nodes, block_size); 111 112 switch (rstr_type) { 113 case CEED_RESTRICTION_STRIDED: { 114 kernel = impl->StridedTranspose; 115 void *args[] = {&num_elem, &d_u, &d_v}; 116 117 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 118 } break; 119 case CEED_RESTRICTION_STANDARD: { 120 if (impl->OffsetTranspose) { 121 kernel = impl->OffsetTranspose; 122 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 123 124 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 125 } else { 126 kernel = impl->OffsetTransposeDet; 127 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 128 129 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 130 } 131 } break; 132 case CEED_RESTRICTION_ORIENTED: { 133 if (use_signs) { 134 kernel = impl->OrientedTranspose; 135 void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; 136 137 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 138 } else { 139 if (impl->OffsetTranspose) { 140 kernel = impl->OffsetTranspose; 141 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 142 143 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 144 } else { 145 kernel = impl->OffsetTransposeDet; 146 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 147 148 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 149 } 150 } 151 } break; 152 case CEED_RESTRICTION_CURL_ORIENTED: { 153 if (use_signs && use_orients) { 154 kernel = impl->CurlOrientedTranspose; 155 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 156 157 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 158 } else if (use_orients) { 159 kernel = impl->CurlOrientedUnsignedTranspose; 160 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 161 162 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 163 } else { 164 if (impl->OffsetTranspose) { 165 kernel = impl->OffsetTranspose; 166 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 167 168 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 169 } else { 170 kernel = impl->OffsetTransposeDet; 171 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 172 173 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 174 } 175 } 176 } break; 177 case CEED_RESTRICTION_POINTS: { 178 // LCOV_EXCL_START 179 return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); 180 // LCOV_EXCL_STOP 181 } break; 182 } 183 } 184 185 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 186 187 // Restore arrays 188 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 189 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 190 return CEED_ERROR_SUCCESS; 191 } 192 193 //------------------------------------------------------------------------------ 194 // Apply restriction 195 //------------------------------------------------------------------------------ 196 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 197 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); 198 } 199 200 //------------------------------------------------------------------------------ 201 // Apply unsigned restriction 202 //------------------------------------------------------------------------------ 203 static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 204 CeedRequest *request) { 205 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); 206 } 207 208 //------------------------------------------------------------------------------ 209 // Apply unoriented restriction 210 //------------------------------------------------------------------------------ 211 static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 212 CeedRequest *request) { 213 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); 214 } 215 216 //------------------------------------------------------------------------------ 217 // Get offsets 218 //------------------------------------------------------------------------------ 219 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 220 CeedElemRestriction_Cuda *impl; 221 222 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 223 switch (mem_type) { 224 case CEED_MEM_HOST: 225 *offsets = impl->h_ind; 226 break; 227 case CEED_MEM_DEVICE: 228 *offsets = impl->d_ind; 229 break; 230 } 231 return CEED_ERROR_SUCCESS; 232 } 233 234 //------------------------------------------------------------------------------ 235 // Get orientations 236 //------------------------------------------------------------------------------ 237 static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { 238 CeedElemRestriction_Cuda *impl; 239 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 240 241 switch (mem_type) { 242 case CEED_MEM_HOST: 243 *orients = impl->h_orients; 244 break; 245 case CEED_MEM_DEVICE: 246 *orients = impl->d_orients; 247 break; 248 } 249 return CEED_ERROR_SUCCESS; 250 } 251 252 //------------------------------------------------------------------------------ 253 // Get curl-conforming orientations 254 //------------------------------------------------------------------------------ 255 static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { 256 CeedElemRestriction_Cuda *impl; 257 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 258 259 switch (mem_type) { 260 case CEED_MEM_HOST: 261 *curl_orients = impl->h_curl_orients; 262 break; 263 case CEED_MEM_DEVICE: 264 *curl_orients = impl->d_curl_orients; 265 break; 266 } 267 return CEED_ERROR_SUCCESS; 268 } 269 270 //------------------------------------------------------------------------------ 271 // Destroy restriction 272 //------------------------------------------------------------------------------ 273 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { 274 Ceed ceed; 275 CeedElemRestriction_Cuda *impl; 276 277 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 278 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 279 CeedCallCuda(ceed, cuModuleUnload(impl->module)); 280 CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 281 CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); 282 CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); 283 CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); 284 CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); 285 CeedCallBackend(CeedFree(&impl->h_orients_allocated)); 286 CeedCallCuda(ceed, cudaFree(impl->d_orients_allocated)); 287 CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); 288 CeedCallCuda(ceed, cudaFree(impl->d_curl_orients_allocated)); 289 CeedCallBackend(CeedFree(&impl)); 290 return CEED_ERROR_SUCCESS; 291 } 292 293 //------------------------------------------------------------------------------ 294 // Create transpose offsets and indices 295 //------------------------------------------------------------------------------ 296 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { 297 Ceed ceed; 298 bool *is_node; 299 CeedSize l_size; 300 CeedInt num_elem, elem_size, num_comp, num_nodes = 0; 301 CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 302 CeedElemRestriction_Cuda *impl; 303 304 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 305 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 306 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 307 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 308 CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 309 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 310 const CeedInt size_indices = num_elem * elem_size; 311 312 // Count num_nodes 313 CeedCallBackend(CeedCalloc(l_size, &is_node)); 314 315 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 316 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 317 impl->num_nodes = num_nodes; 318 319 // L-vector offsets array 320 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 321 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 322 for (CeedInt i = 0, j = 0; i < l_size; i++) { 323 if (is_node[i]) { 324 l_vec_indices[j] = i; 325 ind_to_offset[i] = j++; 326 } 327 } 328 CeedCallBackend(CeedFree(&is_node)); 329 330 // Compute transpose offsets and indices 331 const CeedInt size_offsets = num_nodes + 1; 332 333 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 334 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 335 // Count node multiplicity 336 for (CeedInt e = 0; e < num_elem; ++e) { 337 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 338 } 339 // Convert to running sum 340 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 341 // List all E-vec indices associated with L-vec node 342 for (CeedInt e = 0; e < num_elem; ++e) { 343 for (CeedInt i = 0; i < elem_size; ++i) { 344 const CeedInt lid = elem_size * e + i; 345 const CeedInt gid = indices[lid]; 346 347 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 348 } 349 } 350 // Reset running sum 351 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 352 t_offsets[0] = 0; 353 354 // Copy data to device 355 // -- L-vector indices 356 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 357 CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); 358 // -- Transpose offsets 359 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 360 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); 361 // -- Transpose indices 362 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 363 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); 364 365 // Cleanup 366 CeedCallBackend(CeedFree(&ind_to_offset)); 367 CeedCallBackend(CeedFree(&l_vec_indices)); 368 CeedCallBackend(CeedFree(&t_offsets)); 369 CeedCallBackend(CeedFree(&t_indices)); 370 return CEED_ERROR_SUCCESS; 371 } 372 373 //------------------------------------------------------------------------------ 374 // Create restriction 375 //------------------------------------------------------------------------------ 376 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, 377 const CeedInt8 *curl_orients, CeedElemRestriction rstr) { 378 Ceed ceed, ceed_parent; 379 bool is_deterministic; 380 CeedInt num_elem, num_comp, elem_size, comp_stride = 1; 381 CeedRestrictionType rstr_type; 382 char *restriction_kernel_path, *restriction_kernel_source; 383 CeedElemRestriction_Cuda *impl; 384 385 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 386 CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); 387 CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); 388 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 389 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 390 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 391 const CeedInt size = num_elem * elem_size; 392 CeedInt strides[3] = {1, size, elem_size}; 393 CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 394 395 // Stride data 396 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 397 if (rstr_type == CEED_RESTRICTION_STRIDED) { 398 bool has_backend_strides; 399 400 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 401 if (!has_backend_strides) { 402 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); 403 } 404 } else { 405 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 406 } 407 408 CeedCallBackend(CeedCalloc(1, &impl)); 409 impl->num_nodes = size; 410 impl->h_ind = NULL; 411 impl->h_ind_allocated = NULL; 412 impl->d_ind = NULL; 413 impl->d_ind_allocated = NULL; 414 impl->d_t_indices = NULL; 415 impl->d_t_offsets = NULL; 416 impl->h_orients = NULL; 417 impl->h_orients_allocated = NULL; 418 impl->d_orients = NULL; 419 impl->d_orients_allocated = NULL; 420 impl->h_curl_orients = NULL; 421 impl->h_curl_orients_allocated = NULL; 422 impl->d_curl_orients = NULL; 423 impl->d_curl_orients_allocated = NULL; 424 CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); 425 CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); 426 427 // Set up device offset/orientation arrays 428 if (rstr_type != CEED_RESTRICTION_STRIDED) { 429 switch (mem_type) { 430 case CEED_MEM_HOST: { 431 switch (copy_mode) { 432 case CEED_OWN_POINTER: 433 impl->h_ind_allocated = (CeedInt *)indices; 434 impl->h_ind = (CeedInt *)indices; 435 break; 436 case CEED_USE_POINTER: 437 impl->h_ind = (CeedInt *)indices; 438 break; 439 case CEED_COPY_VALUES: 440 CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); 441 memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); 442 impl->h_ind = impl->h_ind_allocated; 443 break; 444 } 445 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 446 impl->d_ind_allocated = impl->d_ind; // We own the device memory 447 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); 448 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); 449 } break; 450 case CEED_MEM_DEVICE: { 451 switch (copy_mode) { 452 case CEED_COPY_VALUES: 453 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 454 impl->d_ind_allocated = impl->d_ind; // We own the device memory 455 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); 456 break; 457 case CEED_OWN_POINTER: 458 impl->d_ind = (CeedInt *)indices; 459 impl->d_ind_allocated = impl->d_ind; 460 break; 461 case CEED_USE_POINTER: 462 impl->d_ind = (CeedInt *)indices; 463 break; 464 } 465 CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); 466 CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); 467 impl->h_ind = impl->h_ind_allocated; 468 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); 469 } break; 470 } 471 472 // Orientation data 473 if (rstr_type == CEED_RESTRICTION_ORIENTED) { 474 switch (mem_type) { 475 case CEED_MEM_HOST: { 476 switch (copy_mode) { 477 case CEED_OWN_POINTER: 478 impl->h_orients_allocated = (bool *)orients; 479 impl->h_orients = (bool *)orients; 480 break; 481 case CEED_USE_POINTER: 482 impl->h_orients = (bool *)orients; 483 break; 484 case CEED_COPY_VALUES: 485 CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); 486 memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); 487 impl->h_orients = impl->h_orients_allocated; 488 break; 489 } 490 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); 491 impl->d_orients_allocated = impl->d_orients; // We own the device memory 492 CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyHostToDevice)); 493 } break; 494 case CEED_MEM_DEVICE: { 495 switch (copy_mode) { 496 case CEED_COPY_VALUES: 497 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); 498 impl->d_orients_allocated = impl->d_orients; // We own the device memory 499 CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyDeviceToDevice)); 500 break; 501 case CEED_OWN_POINTER: 502 impl->d_orients = (bool *)orients; 503 impl->d_orients_allocated = impl->d_orients; 504 break; 505 case CEED_USE_POINTER: 506 impl->d_orients = (bool *)orients; 507 break; 508 } 509 CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); 510 CeedCallCuda(ceed, cudaMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); 511 impl->h_orients = impl->h_orients_allocated; 512 } break; 513 } 514 } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { 515 switch (mem_type) { 516 case CEED_MEM_HOST: { 517 switch (copy_mode) { 518 case CEED_OWN_POINTER: 519 impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; 520 impl->h_curl_orients = (CeedInt8 *)curl_orients; 521 break; 522 case CEED_USE_POINTER: 523 impl->h_curl_orients = (CeedInt8 *)curl_orients; 524 break; 525 case CEED_COPY_VALUES: 526 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); 527 memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); 528 impl->h_curl_orients = impl->h_curl_orients_allocated; 529 break; 530 } 531 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); 532 impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory 533 CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); 534 } break; 535 case CEED_MEM_DEVICE: { 536 switch (copy_mode) { 537 case CEED_COPY_VALUES: 538 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); 539 impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory 540 CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToDevice)); 541 break; 542 case CEED_OWN_POINTER: 543 impl->d_curl_orients = (CeedInt8 *)curl_orients; 544 impl->d_curl_orients_allocated = impl->d_curl_orients; 545 break; 546 case CEED_USE_POINTER: 547 impl->d_curl_orients = (CeedInt8 *)curl_orients; 548 break; 549 } 550 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); 551 CeedCallCuda(ceed, cudaMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); 552 impl->h_curl_orients = impl->h_curl_orients_allocated; 553 } break; 554 } 555 } 556 } 557 558 // Compile CUDA kernels 559 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); 560 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 561 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 562 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 563 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 564 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", 565 strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); 566 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); 567 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); 568 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); 569 if (!is_deterministic) { 570 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); 571 } else { 572 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); 573 } 574 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); 575 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); 576 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); 577 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); 578 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); 579 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); 580 CeedCallBackend(CeedFree(&restriction_kernel_path)); 581 CeedCallBackend(CeedFree(&restriction_kernel_source)); 582 583 // Register backend functions 584 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); 585 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); 586 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); 587 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); 588 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); 589 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); 590 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); 591 return CEED_ERROR_SUCCESS; 592 } 593 594 //------------------------------------------------------------------------------ 595