1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <cuda.h> 12 #include <cuda_runtime.h> 13 #include <stdbool.h> 14 #include <stddef.h> 15 #include <string.h> 16 17 #include "../cuda/ceed-cuda-common.h" 18 #include "../cuda/ceed-cuda-compile.h" 19 #include "ceed-cuda-ref.h" 20 21 //------------------------------------------------------------------------------ 22 // Core apply restriction code 23 //------------------------------------------------------------------------------ 24 static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, 25 CeedVector u, CeedVector v, CeedRequest *request) { 26 Ceed ceed; 27 CeedInt num_elem, elem_size; 28 CeedRestrictionType rstr_type; 29 const CeedScalar *d_u; 30 CeedScalar *d_v; 31 CeedElemRestriction_Cuda *impl; 32 CUfunction kernel; 33 34 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 35 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 36 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 37 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 38 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 39 const CeedInt num_nodes = impl->num_nodes; 40 41 // Get vectors 42 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 43 if (t_mode == CEED_TRANSPOSE) { 44 // Sum into for transpose mode, e-vec to l-vec 45 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 46 } else { 47 // Overwrite for notranspose mode, l-vec to e-vec 48 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 49 } 50 51 // Restrict 52 if (t_mode == CEED_NOTRANSPOSE) { 53 // L-vector -> E-vector 54 const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; 55 const CeedInt grid = CeedDivUpInt(num_nodes, block_size); 56 57 switch (rstr_type) { 58 case CEED_RESTRICTION_STRIDED: { 59 kernel = impl->StridedNoTranspose; 60 void *args[] = {&num_elem, &d_u, &d_v}; 61 62 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 63 } break; 64 case CEED_RESTRICTION_STANDARD: { 65 kernel = impl->OffsetNoTranspose; 66 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 67 68 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 69 } break; 70 case CEED_RESTRICTION_ORIENTED: { 71 if (use_signs) { 72 kernel = impl->OrientedNoTranspose; 73 void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; 74 75 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 76 } else { 77 kernel = impl->OffsetNoTranspose; 78 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 79 80 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 81 } 82 } break; 83 case CEED_RESTRICTION_CURL_ORIENTED: { 84 if (use_signs && use_orients) { 85 kernel = impl->CurlOrientedNoTranspose; 86 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 87 88 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 89 } else if (use_orients) { 90 kernel = impl->CurlOrientedUnsignedNoTranspose; 91 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 92 93 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 94 } else { 95 kernel = impl->OffsetNoTranspose; 96 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 97 98 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 99 } 100 } break; 101 } 102 } else { 103 // E-vector -> L-vector 104 const CeedInt block_size = 32; 105 const CeedInt grid = CeedDivUpInt(num_nodes, block_size); 106 107 switch (rstr_type) { 108 case CEED_RESTRICTION_STRIDED: { 109 kernel = impl->StridedTranspose; 110 void *args[] = {&num_elem, &d_u, &d_v}; 111 112 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 113 } break; 114 case CEED_RESTRICTION_STANDARD: { 115 if (impl->OffsetTranspose) { 116 kernel = impl->OffsetTranspose; 117 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 118 119 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 120 } else { 121 kernel = impl->OffsetTransposeDet; 122 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 123 124 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 125 } 126 } break; 127 case CEED_RESTRICTION_ORIENTED: { 128 if (use_signs) { 129 kernel = impl->OrientedTranspose; 130 void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; 131 132 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 133 } else { 134 if (impl->OffsetTranspose) { 135 kernel = impl->OffsetTranspose; 136 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 137 138 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 139 } else { 140 kernel = impl->OffsetTransposeDet; 141 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 142 143 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 144 } 145 } 146 } break; 147 case CEED_RESTRICTION_CURL_ORIENTED: { 148 if (use_signs && use_orients) { 149 kernel = impl->CurlOrientedTranspose; 150 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 151 152 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 153 } else if (use_orients) { 154 kernel = impl->CurlOrientedUnsignedTranspose; 155 void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; 156 157 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 158 } else { 159 if (impl->OffsetTranspose) { 160 kernel = impl->OffsetTranspose; 161 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 162 163 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 164 } else { 165 kernel = impl->OffsetTransposeDet; 166 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 167 168 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); 169 } 170 } 171 } break; 172 } 173 } 174 175 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 176 177 // Restore arrays 178 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 179 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 180 return CEED_ERROR_SUCCESS; 181 } 182 183 //------------------------------------------------------------------------------ 184 // Apply restriction 185 //------------------------------------------------------------------------------ 186 static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 187 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); 188 } 189 190 //------------------------------------------------------------------------------ 191 // Apply unsigned restriction 192 //------------------------------------------------------------------------------ 193 static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 194 CeedRequest *request) { 195 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); 196 } 197 198 //------------------------------------------------------------------------------ 199 // Apply unoriented restriction 200 //------------------------------------------------------------------------------ 201 static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 202 CeedRequest *request) { 203 return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); 204 } 205 206 //------------------------------------------------------------------------------ 207 // Get offsets 208 //------------------------------------------------------------------------------ 209 static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 210 CeedElemRestriction_Cuda *impl; 211 212 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 213 switch (mem_type) { 214 case CEED_MEM_HOST: 215 *offsets = impl->h_ind; 216 break; 217 case CEED_MEM_DEVICE: 218 *offsets = impl->d_ind; 219 break; 220 } 221 return CEED_ERROR_SUCCESS; 222 } 223 224 //------------------------------------------------------------------------------ 225 // Get orientations 226 //------------------------------------------------------------------------------ 227 static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { 228 CeedElemRestriction_Cuda *impl; 229 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 230 231 switch (mem_type) { 232 case CEED_MEM_HOST: 233 *orients = impl->h_orients; 234 break; 235 case CEED_MEM_DEVICE: 236 *orients = impl->d_orients; 237 break; 238 } 239 return CEED_ERROR_SUCCESS; 240 } 241 242 //------------------------------------------------------------------------------ 243 // Get curl-conforming orientations 244 //------------------------------------------------------------------------------ 245 static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { 246 CeedElemRestriction_Cuda *impl; 247 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 248 249 switch (mem_type) { 250 case CEED_MEM_HOST: 251 *curl_orients = impl->h_curl_orients; 252 break; 253 case CEED_MEM_DEVICE: 254 *curl_orients = impl->d_curl_orients; 255 break; 256 } 257 return CEED_ERROR_SUCCESS; 258 } 259 260 //------------------------------------------------------------------------------ 261 // Destroy restriction 262 //------------------------------------------------------------------------------ 263 static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { 264 Ceed ceed; 265 CeedElemRestriction_Cuda *impl; 266 267 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 268 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 269 CeedCallCuda(ceed, cuModuleUnload(impl->module)); 270 CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 271 CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); 272 CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); 273 CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); 274 CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); 275 CeedCallBackend(CeedFree(&impl->h_orients_allocated)); 276 CeedCallCuda(ceed, cudaFree(impl->d_orients_allocated)); 277 CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); 278 CeedCallCuda(ceed, cudaFree(impl->d_curl_orients_allocated)); 279 CeedCallBackend(CeedFree(&impl)); 280 return CEED_ERROR_SUCCESS; 281 } 282 283 //------------------------------------------------------------------------------ 284 // Create transpose offsets and indices 285 //------------------------------------------------------------------------------ 286 static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { 287 Ceed ceed; 288 bool *is_node; 289 CeedSize l_size; 290 CeedInt num_elem, elem_size, num_comp, num_nodes = 0; 291 CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; 292 CeedElemRestriction_Cuda *impl; 293 294 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 295 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 296 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 297 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 298 CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 299 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 300 const CeedInt size_indices = num_elem * elem_size; 301 302 // Count num_nodes 303 CeedCallBackend(CeedCalloc(l_size, &is_node)); 304 305 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 306 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 307 impl->num_nodes = num_nodes; 308 309 // L-vector offsets array 310 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 311 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 312 for (CeedInt i = 0, j = 0; i < l_size; i++) { 313 if (is_node[i]) { 314 l_vec_indices[j] = i; 315 ind_to_offset[i] = j++; 316 } 317 } 318 CeedCallBackend(CeedFree(&is_node)); 319 320 // Compute transpose offsets and indices 321 const CeedInt size_offsets = num_nodes + 1; 322 323 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 324 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 325 // Count node multiplicity 326 for (CeedInt e = 0; e < num_elem; ++e) { 327 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 328 } 329 // Convert to running sum 330 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 331 // List all E-vec indices associated with L-vec node 332 for (CeedInt e = 0; e < num_elem; ++e) { 333 for (CeedInt i = 0; i < elem_size; ++i) { 334 const CeedInt lid = elem_size * e + i; 335 const CeedInt gid = indices[lid]; 336 337 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 338 } 339 } 340 // Reset running sum 341 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 342 t_offsets[0] = 0; 343 344 // Copy data to device 345 // -- L-vector indices 346 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 347 CeedCallCuda(ceed, cudaMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), cudaMemcpyHostToDevice)); 348 // -- Transpose offsets 349 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 350 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), cudaMemcpyHostToDevice)); 351 // -- Transpose indices 352 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 353 CeedCallCuda(ceed, cudaMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), cudaMemcpyHostToDevice)); 354 355 // Cleanup 356 CeedCallBackend(CeedFree(&ind_to_offset)); 357 CeedCallBackend(CeedFree(&l_vec_indices)); 358 CeedCallBackend(CeedFree(&t_offsets)); 359 CeedCallBackend(CeedFree(&t_indices)); 360 return CEED_ERROR_SUCCESS; 361 } 362 363 //------------------------------------------------------------------------------ 364 // Create restriction 365 //------------------------------------------------------------------------------ 366 int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, 367 const CeedInt8 *curl_orients, CeedElemRestriction rstr) { 368 Ceed ceed, ceed_parent; 369 bool is_deterministic; 370 CeedInt num_elem, num_comp, elem_size, comp_stride = 1; 371 CeedRestrictionType rstr_type; 372 char *restriction_kernel_path, *restriction_kernel_source; 373 CeedElemRestriction_Cuda *impl; 374 375 CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 376 CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); 377 CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); 378 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 379 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 380 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 381 const CeedInt size = num_elem * elem_size; 382 CeedInt strides[3] = {1, size, elem_size}; 383 CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 384 385 // Stride data 386 CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 387 if (rstr_type == CEED_RESTRICTION_STRIDED) { 388 bool has_backend_strides; 389 390 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 391 if (!has_backend_strides) { 392 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); 393 } 394 } else { 395 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 396 } 397 398 CeedCallBackend(CeedCalloc(1, &impl)); 399 impl->num_nodes = size; 400 impl->h_ind = NULL; 401 impl->h_ind_allocated = NULL; 402 impl->d_ind = NULL; 403 impl->d_ind_allocated = NULL; 404 impl->d_t_indices = NULL; 405 impl->d_t_offsets = NULL; 406 impl->h_orients = NULL; 407 impl->h_orients_allocated = NULL; 408 impl->d_orients = NULL; 409 impl->d_orients_allocated = NULL; 410 impl->h_curl_orients = NULL; 411 impl->h_curl_orients_allocated = NULL; 412 impl->d_curl_orients = NULL; 413 impl->d_curl_orients_allocated = NULL; 414 CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); 415 CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); 416 417 // Set up device offset/orientation arrays 418 if (rstr_type != CEED_RESTRICTION_STRIDED) { 419 switch (mem_type) { 420 case CEED_MEM_HOST: { 421 switch (copy_mode) { 422 case CEED_OWN_POINTER: 423 impl->h_ind_allocated = (CeedInt *)indices; 424 impl->h_ind = (CeedInt *)indices; 425 break; 426 case CEED_USE_POINTER: 427 impl->h_ind = (CeedInt *)indices; 428 break; 429 case CEED_COPY_VALUES: 430 CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); 431 memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); 432 impl->h_ind = impl->h_ind_allocated; 433 break; 434 } 435 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 436 impl->d_ind_allocated = impl->d_ind; // We own the device memory 437 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); 438 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); 439 } break; 440 case CEED_MEM_DEVICE: { 441 switch (copy_mode) { 442 case CEED_COPY_VALUES: 443 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 444 impl->d_ind_allocated = impl->d_ind; // We own the device memory 445 CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); 446 break; 447 case CEED_OWN_POINTER: 448 impl->d_ind = (CeedInt *)indices; 449 impl->d_ind_allocated = impl->d_ind; 450 break; 451 case CEED_USE_POINTER: 452 impl->d_ind = (CeedInt *)indices; 453 break; 454 } 455 CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); 456 CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); 457 impl->h_ind = impl->h_ind_allocated; 458 if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); 459 } break; 460 } 461 462 // Orientation data 463 if (rstr_type == CEED_RESTRICTION_ORIENTED) { 464 switch (mem_type) { 465 case CEED_MEM_HOST: { 466 switch (copy_mode) { 467 case CEED_OWN_POINTER: 468 impl->h_orients_allocated = (bool *)orients; 469 impl->h_orients = (bool *)orients; 470 break; 471 case CEED_USE_POINTER: 472 impl->h_orients = (bool *)orients; 473 break; 474 case CEED_COPY_VALUES: 475 CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); 476 memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); 477 impl->h_orients = impl->h_orients_allocated; 478 break; 479 } 480 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); 481 impl->d_orients_allocated = impl->d_orients; // We own the device memory 482 CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyHostToDevice)); 483 } break; 484 case CEED_MEM_DEVICE: { 485 switch (copy_mode) { 486 case CEED_COPY_VALUES: 487 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); 488 impl->d_orients_allocated = impl->d_orients; // We own the device memory 489 CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyDeviceToDevice)); 490 break; 491 case CEED_OWN_POINTER: 492 impl->d_orients = (bool *)orients; 493 impl->d_orients_allocated = impl->d_orients; 494 break; 495 case CEED_USE_POINTER: 496 impl->d_orients = (bool *)orients; 497 break; 498 } 499 CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); 500 CeedCallCuda(ceed, cudaMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); 501 impl->h_orients = impl->h_orients_allocated; 502 } break; 503 } 504 } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { 505 switch (mem_type) { 506 case CEED_MEM_HOST: { 507 switch (copy_mode) { 508 case CEED_OWN_POINTER: 509 impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; 510 impl->h_curl_orients = (CeedInt8 *)curl_orients; 511 break; 512 case CEED_USE_POINTER: 513 impl->h_curl_orients = (CeedInt8 *)curl_orients; 514 break; 515 case CEED_COPY_VALUES: 516 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); 517 memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); 518 impl->h_curl_orients = impl->h_curl_orients_allocated; 519 break; 520 } 521 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); 522 impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory 523 CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); 524 } break; 525 case CEED_MEM_DEVICE: { 526 switch (copy_mode) { 527 case CEED_COPY_VALUES: 528 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); 529 impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory 530 CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToDevice)); 531 break; 532 case CEED_OWN_POINTER: 533 impl->d_curl_orients = (CeedInt8 *)curl_orients; 534 impl->d_curl_orients_allocated = impl->d_curl_orients; 535 break; 536 case CEED_USE_POINTER: 537 impl->d_curl_orients = (CeedInt8 *)curl_orients; 538 break; 539 } 540 CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); 541 CeedCallCuda(ceed, cudaMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); 542 impl->h_curl_orients = impl->h_curl_orients_allocated; 543 } break; 544 } 545 } 546 } 547 548 // Compile CUDA kernels 549 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); 550 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); 551 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 552 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); 553 CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, 554 "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", 555 strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); 556 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); 557 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); 558 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); 559 if (!is_deterministic) { 560 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); 561 } else { 562 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); 563 } 564 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); 565 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); 566 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); 567 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); 568 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); 569 CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); 570 CeedCallBackend(CeedFree(&restriction_kernel_path)); 571 CeedCallBackend(CeedFree(&restriction_kernel_source)); 572 573 // Register backend functions 574 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); 575 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); 576 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); 577 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); 578 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); 579 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); 580 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); 581 return CEED_ERROR_SUCCESS; 582 } 583 584 //------------------------------------------------------------------------------ 585