1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <cuda_runtime.h> 11 #include <math.h> 12 #include <stdbool.h> 13 #include <string.h> 14 15 #include "../cuda/ceed-cuda-common.h" 16 #include "ceed-cuda-ref.h" 17 18 //------------------------------------------------------------------------------ 19 // Check if host/device sync is needed 20 //------------------------------------------------------------------------------ 21 static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { 22 bool has_valid_array = false; 23 CeedVector_Cuda *impl; 24 25 CeedCallBackend(CeedVectorGetData(vec, &impl)); 26 CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array)); 27 switch (mem_type) { 28 case CEED_MEM_HOST: 29 *need_sync = has_valid_array && !impl->h_array; 30 break; 31 case CEED_MEM_DEVICE: 32 *need_sync = has_valid_array && !impl->d_array; 33 break; 34 } 35 return CEED_ERROR_SUCCESS; 36 } 37 38 //------------------------------------------------------------------------------ 39 // Sync host to device 40 //------------------------------------------------------------------------------ 41 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { 42 CeedSize length; 43 size_t bytes; 44 Ceed ceed; 45 CeedVector_Cuda *impl; 46 47 CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); 48 CeedCallBackend(CeedVectorGetData(vec, &impl)); 49 50 CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device"); 51 52 CeedCallBackend(CeedVectorGetLength(vec, &length)); 53 bytes = length * sizeof(CeedScalar); 54 if (impl->d_array_borrowed) { 55 impl->d_array = impl->d_array_borrowed; 56 } else if (impl->d_array_owned) { 57 impl->d_array = impl->d_array_owned; 58 } else { 59 CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); 60 impl->d_array = impl->d_array_owned; 61 } 62 CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice)); 63 return CEED_ERROR_SUCCESS; 64 } 65 66 //------------------------------------------------------------------------------ 67 // Sync device to host 68 //------------------------------------------------------------------------------ 69 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { 70 CeedSize length; 71 Ceed ceed; 72 CeedVector_Cuda *impl; 73 74 CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); 75 CeedCallBackend(CeedVectorGetData(vec, &impl)); 76 77 CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); 78 79 if (impl->h_array_borrowed) { 80 impl->h_array = impl->h_array_borrowed; 81 } else if (impl->h_array_owned) { 82 impl->h_array = impl->h_array_owned; 83 } else { 84 CeedSize length; 85 86 CeedCallBackend(CeedVectorGetLength(vec, &length)); 87 CeedCallBackend(CeedCalloc(length, &impl->h_array_owned)); 88 impl->h_array = impl->h_array_owned; 89 } 90 91 CeedCallBackend(CeedVectorGetLength(vec, &length)); 92 size_t bytes = length * sizeof(CeedScalar); 93 94 CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost)); 95 return CEED_ERROR_SUCCESS; 96 } 97 98 //------------------------------------------------------------------------------ 99 // Sync arrays 100 //------------------------------------------------------------------------------ 101 static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) { 102 bool need_sync = false; 103 104 // Check whether device/host sync is needed 105 CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync)); 106 if (!need_sync) return CEED_ERROR_SUCCESS; 107 108 switch (mem_type) { 109 case CEED_MEM_HOST: 110 return CeedVectorSyncD2H_Cuda(vec); 111 case CEED_MEM_DEVICE: 112 return CeedVectorSyncH2D_Cuda(vec); 113 } 114 return CEED_ERROR_UNSUPPORTED; 115 } 116 117 //------------------------------------------------------------------------------ 118 // Set all pointers as invalid 119 //------------------------------------------------------------------------------ 120 static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) { 121 CeedVector_Cuda *impl; 122 123 CeedCallBackend(CeedVectorGetData(vec, &impl)); 124 impl->h_array = NULL; 125 impl->d_array = NULL; 126 return CEED_ERROR_SUCCESS; 127 } 128 129 //------------------------------------------------------------------------------ 130 // Check if CeedVector has any valid pointer 131 //------------------------------------------------------------------------------ 132 static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) { 133 CeedVector_Cuda *impl; 134 135 CeedCallBackend(CeedVectorGetData(vec, &impl)); 136 *has_valid_array = impl->h_array || impl->d_array; 137 return CEED_ERROR_SUCCESS; 138 } 139 140 //------------------------------------------------------------------------------ 141 // Check if has array of given type 142 //------------------------------------------------------------------------------ 143 static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { 144 CeedVector_Cuda *impl; 145 146 CeedCallBackend(CeedVectorGetData(vec, &impl)); 147 switch (mem_type) { 148 case CEED_MEM_HOST: 149 *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned; 150 break; 151 case CEED_MEM_DEVICE: 152 *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned; 153 break; 154 } 155 return CEED_ERROR_SUCCESS; 156 } 157 158 //------------------------------------------------------------------------------ 159 // Check if has borrowed array of given type 160 //------------------------------------------------------------------------------ 161 static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { 162 CeedVector_Cuda *impl; 163 164 CeedCallBackend(CeedVectorGetData(vec, &impl)); 165 switch (mem_type) { 166 case CEED_MEM_HOST: 167 *has_borrowed_array_of_type = impl->h_array_borrowed; 168 break; 169 case CEED_MEM_DEVICE: 170 *has_borrowed_array_of_type = impl->d_array_borrowed; 171 break; 172 } 173 return CEED_ERROR_SUCCESS; 174 } 175 176 //------------------------------------------------------------------------------ 177 // Set array from host 178 //------------------------------------------------------------------------------ 179 static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { 180 CeedSize length; 181 CeedVector_Cuda *impl; 182 183 CeedCallBackend(CeedVectorGetData(vec, &impl)); 184 CeedCallBackend(CeedVectorGetLength(vec, &length)); 185 186 CeedCallBackend(CeedSetHostCeedScalarArray(array, copy_mode, length, (const CeedScalar **)&impl->h_array_owned, 187 (const CeedScalar **)&impl->h_array_borrowed, (const CeedScalar **)&impl->h_array)); 188 return CEED_ERROR_SUCCESS; 189 } 190 191 //------------------------------------------------------------------------------ 192 // Set array from device 193 //------------------------------------------------------------------------------ 194 static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { 195 CeedSize length; 196 Ceed ceed; 197 CeedVector_Cuda *impl; 198 199 CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); 200 CeedCallBackend(CeedVectorGetData(vec, &impl)); 201 CeedCallBackend(CeedVectorGetLength(vec, &length)); 202 203 CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned, 204 (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array)); 205 return CEED_ERROR_SUCCESS; 206 } 207 208 //------------------------------------------------------------------------------ 209 // Set the array used by a vector, 210 // freeing any previously allocated array if applicable 211 //------------------------------------------------------------------------------ 212 static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { 213 CeedVector_Cuda *impl; 214 215 CeedCallBackend(CeedVectorGetData(vec, &impl)); 216 CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); 217 switch (mem_type) { 218 case CEED_MEM_HOST: 219 return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array); 220 case CEED_MEM_DEVICE: 221 return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array); 222 } 223 return CEED_ERROR_UNSUPPORTED; 224 } 225 226 //------------------------------------------------------------------------------ 227 // Set host array to value 228 //------------------------------------------------------------------------------ 229 static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) { 230 for (CeedSize i = 0; i < length; i++) h_array[i] = val; 231 return CEED_ERROR_SUCCESS; 232 } 233 234 //------------------------------------------------------------------------------ 235 // Set device array to value (impl in .cu file) 236 //------------------------------------------------------------------------------ 237 int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val); 238 239 //------------------------------------------------------------------------------ 240 // Set a vector to a value 241 //------------------------------------------------------------------------------ 242 static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { 243 CeedSize length; 244 CeedVector_Cuda *impl; 245 246 CeedCallBackend(CeedVectorGetData(vec, &impl)); 247 CeedCallBackend(CeedVectorGetLength(vec, &length)); 248 // Set value for synced device/host array 249 if (!impl->d_array && !impl->h_array) { 250 if (impl->d_array_borrowed) { 251 impl->d_array = impl->d_array_borrowed; 252 } else if (impl->h_array_borrowed) { 253 impl->h_array = impl->h_array_borrowed; 254 } else if (impl->d_array_owned) { 255 impl->d_array = impl->d_array_owned; 256 } else if (impl->h_array_owned) { 257 impl->h_array = impl->h_array_owned; 258 } else { 259 CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL)); 260 } 261 } 262 if (impl->d_array) { 263 CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val)); 264 impl->h_array = NULL; 265 } 266 if (impl->h_array) { 267 CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val)); 268 impl->d_array = NULL; 269 } 270 return CEED_ERROR_SUCCESS; 271 } 272 273 //------------------------------------------------------------------------------ 274 // Vector Take Array 275 //------------------------------------------------------------------------------ 276 static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { 277 CeedVector_Cuda *impl; 278 279 CeedCallBackend(CeedVectorGetData(vec, &impl)); 280 // Sync array to requested mem_type 281 CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); 282 // Update pointer 283 switch (mem_type) { 284 case CEED_MEM_HOST: 285 (*array) = impl->h_array_borrowed; 286 impl->h_array_borrowed = NULL; 287 impl->h_array = NULL; 288 break; 289 case CEED_MEM_DEVICE: 290 (*array) = impl->d_array_borrowed; 291 impl->d_array_borrowed = NULL; 292 impl->d_array = NULL; 293 break; 294 } 295 return CEED_ERROR_SUCCESS; 296 } 297 298 //------------------------------------------------------------------------------ 299 // Core logic for array syncronization for GetArray. 300 // If a different memory type is most up to date, this will perform a copy 301 //------------------------------------------------------------------------------ 302 static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { 303 CeedVector_Cuda *impl; 304 305 CeedCallBackend(CeedVectorGetData(vec, &impl)); 306 // Sync array to requested mem_type 307 CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); 308 // Update pointer 309 switch (mem_type) { 310 case CEED_MEM_HOST: 311 *array = impl->h_array; 312 break; 313 case CEED_MEM_DEVICE: 314 *array = impl->d_array; 315 break; 316 } 317 return CEED_ERROR_SUCCESS; 318 } 319 320 //------------------------------------------------------------------------------ 321 // Get read-only access to a vector via the specified mem_type 322 //------------------------------------------------------------------------------ 323 static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) { 324 return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array); 325 } 326 327 //------------------------------------------------------------------------------ 328 // Get read/write access to a vector via the specified mem_type 329 //------------------------------------------------------------------------------ 330 static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { 331 CeedVector_Cuda *impl; 332 333 CeedCallBackend(CeedVectorGetData(vec, &impl)); 334 CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array)); 335 CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); 336 switch (mem_type) { 337 case CEED_MEM_HOST: 338 impl->h_array = *array; 339 break; 340 case CEED_MEM_DEVICE: 341 impl->d_array = *array; 342 break; 343 } 344 return CEED_ERROR_SUCCESS; 345 } 346 347 //------------------------------------------------------------------------------ 348 // Get write access to a vector via the specified mem_type 349 //------------------------------------------------------------------------------ 350 static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { 351 bool has_array_of_type = true; 352 CeedVector_Cuda *impl; 353 354 CeedCallBackend(CeedVectorGetData(vec, &impl)); 355 CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type)); 356 if (!has_array_of_type) { 357 // Allocate if array is not yet allocated 358 CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL)); 359 } else { 360 // Select dirty array 361 switch (mem_type) { 362 case CEED_MEM_HOST: 363 if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed; 364 else impl->h_array = impl->h_array_owned; 365 break; 366 case CEED_MEM_DEVICE: 367 if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed; 368 else impl->d_array = impl->d_array_owned; 369 } 370 } 371 return CeedVectorGetArray_Cuda(vec, mem_type, array); 372 } 373 374 //------------------------------------------------------------------------------ 375 // Get the norm of a CeedVector 376 //------------------------------------------------------------------------------ 377 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) { 378 Ceed ceed; 379 CeedSize length; 380 #if CUDA_VERSION < 12000 381 CeedSize num_calls; 382 #endif 383 const CeedScalar *d_array; 384 CeedVector_Cuda *impl; 385 cublasHandle_t handle; 386 387 CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); 388 CeedCallBackend(CeedVectorGetData(vec, &impl)); 389 CeedCallBackend(CeedVectorGetLength(vec, &length)); 390 CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle)); 391 392 #if CUDA_VERSION < 12000 393 // With CUDA 12, we can use the 64-bit integer interface. Prior to that, 394 // we need to check if the vector is too long to handle with int32, 395 // and if so, divide it into subsections for repeated cuBLAS calls. 396 num_calls = length / INT_MAX; 397 if (length % INT_MAX > 0) num_calls += 1; 398 #endif 399 400 // Compute norm 401 CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); 402 switch (type) { 403 case CEED_NORM_1: { 404 *norm = 0.0; 405 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 406 #if CUDA_VERSION >= 12000 // We have CUDA 12, and can use 64-bit integers 407 CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); 408 #else 409 float sub_norm = 0.0; 410 float *d_array_start; 411 412 for (CeedInt i = 0; i < num_calls; i++) { 413 d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; 414 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 415 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 416 417 CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); 418 *norm += sub_norm; 419 } 420 #endif 421 } else { 422 #if CUDA_VERSION >= 12000 423 CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); 424 #else 425 double sub_norm = 0.0; 426 double *d_array_start; 427 428 for (CeedInt i = 0; i < num_calls; i++) { 429 d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; 430 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 431 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 432 433 CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); 434 *norm += sub_norm; 435 } 436 #endif 437 } 438 break; 439 } 440 case CEED_NORM_2: { 441 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 442 #if CUDA_VERSION >= 12000 443 CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm)); 444 #else 445 float sub_norm = 0.0, norm_sum = 0.0; 446 float *d_array_start; 447 448 for (CeedInt i = 0; i < num_calls; i++) { 449 d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; 450 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 451 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 452 453 CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); 454 norm_sum += sub_norm * sub_norm; 455 } 456 *norm = sqrt(norm_sum); 457 #endif 458 } else { 459 #if CUDA_VERSION >= 12000 460 CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm)); 461 #else 462 double sub_norm = 0.0, norm_sum = 0.0; 463 double *d_array_start; 464 465 for (CeedInt i = 0; i < num_calls; i++) { 466 d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; 467 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 468 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 469 470 CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); 471 norm_sum += sub_norm * sub_norm; 472 } 473 *norm = sqrt(norm_sum); 474 #endif 475 } 476 break; 477 } 478 case CEED_NORM_MAX: { 479 if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { 480 #if CUDA_VERSION >= 12000 481 int64_t index; 482 CeedScalar norm_no_abs; 483 484 CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); 485 CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); 486 *norm = fabs(norm_no_abs); 487 #else 488 CeedInt index; 489 float sub_max = 0.0, current_max = 0.0; 490 float *d_array_start; 491 492 for (CeedInt i = 0; i < num_calls; i++) { 493 d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; 494 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 495 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 496 497 CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); 498 CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); 499 if (fabs(sub_max) > current_max) current_max = fabs(sub_max); 500 } 501 *norm = current_max; 502 #endif 503 } else { 504 #if CUDA_VERSION >= 12000 505 int64_t index; 506 CeedScalar norm_no_abs; 507 508 CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); 509 CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); 510 *norm = fabs(norm_no_abs); 511 #else 512 CeedInt index; 513 double sub_max = 0.0, current_max = 0.0; 514 double *d_array_start; 515 516 for (CeedInt i = 0; i < num_calls; i++) { 517 d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; 518 CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; 519 CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; 520 521 CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); 522 CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); 523 if (fabs(sub_max) > current_max) current_max = fabs(sub_max); 524 } 525 *norm = current_max; 526 #endif 527 } 528 break; 529 } 530 } 531 CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); 532 return CEED_ERROR_SUCCESS; 533 } 534 535 //------------------------------------------------------------------------------ 536 // Take reciprocal of a vector on host 537 //------------------------------------------------------------------------------ 538 static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) { 539 for (CeedSize i = 0; i < length; i++) { 540 if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i]; 541 } 542 return CEED_ERROR_SUCCESS; 543 } 544 545 //------------------------------------------------------------------------------ 546 // Take reciprocal of a vector on device (impl in .cu file) 547 //------------------------------------------------------------------------------ 548 int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length); 549 550 //------------------------------------------------------------------------------ 551 // Take reciprocal of a vector 552 //------------------------------------------------------------------------------ 553 static int CeedVectorReciprocal_Cuda(CeedVector vec) { 554 CeedSize length; 555 CeedVector_Cuda *impl; 556 557 CeedCallBackend(CeedVectorGetData(vec, &impl)); 558 CeedCallBackend(CeedVectorGetLength(vec, &length)); 559 // Set value for synced device/host array 560 if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length)); 561 if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length)); 562 return CEED_ERROR_SUCCESS; 563 } 564 565 //------------------------------------------------------------------------------ 566 // Compute x = alpha x on the host 567 //------------------------------------------------------------------------------ 568 static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { 569 for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha; 570 return CEED_ERROR_SUCCESS; 571 } 572 573 //------------------------------------------------------------------------------ 574 // Compute x = alpha x on device (impl in .cu file) 575 //------------------------------------------------------------------------------ 576 int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length); 577 578 //------------------------------------------------------------------------------ 579 // Compute x = alpha x 580 //------------------------------------------------------------------------------ 581 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) { 582 CeedSize length; 583 CeedVector_Cuda *x_impl; 584 585 CeedCallBackend(CeedVectorGetData(x, &x_impl)); 586 CeedCallBackend(CeedVectorGetLength(x, &length)); 587 // Set value for synced device/host array 588 if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length)); 589 if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length)); 590 return CEED_ERROR_SUCCESS; 591 } 592 593 //------------------------------------------------------------------------------ 594 // Compute y = alpha x + y on the host 595 //------------------------------------------------------------------------------ 596 static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { 597 for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i]; 598 return CEED_ERROR_SUCCESS; 599 } 600 601 //------------------------------------------------------------------------------ 602 // Compute y = alpha x + y on device (impl in .cu file) 603 //------------------------------------------------------------------------------ 604 int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length); 605 606 //------------------------------------------------------------------------------ 607 // Compute y = alpha x + y 608 //------------------------------------------------------------------------------ 609 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { 610 Ceed ceed; 611 CeedSize length; 612 CeedVector_Cuda *y_impl, *x_impl; 613 614 CeedCallBackend(CeedVectorGetCeed(y, &ceed)); 615 CeedCallBackend(CeedVectorGetData(y, &y_impl)); 616 CeedCallBackend(CeedVectorGetData(x, &x_impl)); 617 CeedCallBackend(CeedVectorGetLength(y, &length)); 618 // Set value for synced device/host array 619 if (y_impl->d_array) { 620 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); 621 CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length)); 622 } 623 if (y_impl->h_array) { 624 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); 625 CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length)); 626 } 627 return CEED_ERROR_SUCCESS; 628 } 629 630 //------------------------------------------------------------------------------ 631 // Compute y = alpha x + beta y on the host 632 //------------------------------------------------------------------------------ 633 static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { 634 for (CeedSize i = 0; i < length; i++) y_array[i] = alpha * x_array[i] + beta * y_array[i]; 635 return CEED_ERROR_SUCCESS; 636 } 637 638 //------------------------------------------------------------------------------ 639 // Compute y = alpha x + beta y on device (impl in .cu file) 640 //------------------------------------------------------------------------------ 641 int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length); 642 643 //------------------------------------------------------------------------------ 644 // Compute y = alpha x + beta y 645 //------------------------------------------------------------------------------ 646 static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) { 647 CeedSize length; 648 CeedVector_Cuda *y_impl, *x_impl; 649 650 CeedCallBackend(CeedVectorGetData(y, &y_impl)); 651 CeedCallBackend(CeedVectorGetData(x, &x_impl)); 652 CeedCallBackend(CeedVectorGetLength(y, &length)); 653 // Set value for synced device/host array 654 if (y_impl->d_array) { 655 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); 656 CeedCallBackend(CeedDeviceAXPBY_Cuda(y_impl->d_array, alpha, beta, x_impl->d_array, length)); 657 } 658 if (y_impl->h_array) { 659 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); 660 CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length)); 661 } 662 return CEED_ERROR_SUCCESS; 663 } 664 665 //------------------------------------------------------------------------------ 666 // Compute the pointwise multiplication w = x .* y on the host 667 //------------------------------------------------------------------------------ 668 static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { 669 for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i]; 670 return CEED_ERROR_SUCCESS; 671 } 672 673 //------------------------------------------------------------------------------ 674 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file) 675 //------------------------------------------------------------------------------ 676 int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length); 677 678 //------------------------------------------------------------------------------ 679 // Compute the pointwise multiplication w = x .* y 680 //------------------------------------------------------------------------------ 681 static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) { 682 CeedSize length; 683 CeedVector_Cuda *w_impl, *x_impl, *y_impl; 684 685 CeedCallBackend(CeedVectorGetData(w, &w_impl)); 686 CeedCallBackend(CeedVectorGetData(x, &x_impl)); 687 CeedCallBackend(CeedVectorGetData(y, &y_impl)); 688 CeedCallBackend(CeedVectorGetLength(w, &length)); 689 // Set value for synced device/host array 690 if (!w_impl->d_array && !w_impl->h_array) { 691 CeedCallBackend(CeedVectorSetValue(w, 0.0)); 692 } 693 if (w_impl->d_array) { 694 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); 695 CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE)); 696 CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length)); 697 } 698 if (w_impl->h_array) { 699 CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); 700 CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); 701 CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); 702 } 703 return CEED_ERROR_SUCCESS; 704 } 705 706 //------------------------------------------------------------------------------ 707 // Destroy the vector 708 //------------------------------------------------------------------------------ 709 static int CeedVectorDestroy_Cuda(const CeedVector vec) { 710 CeedVector_Cuda *impl; 711 712 CeedCallBackend(CeedVectorGetData(vec, &impl)); 713 CeedCallCuda(CeedVectorReturnCeed(vec), cudaFree(impl->d_array_owned)); 714 CeedCallBackend(CeedFree(&impl->h_array_owned)); 715 CeedCallBackend(CeedFree(&impl)); 716 return CEED_ERROR_SUCCESS; 717 } 718 719 //------------------------------------------------------------------------------ 720 // Create a vector of the specified length (does not allocate memory) 721 //------------------------------------------------------------------------------ 722 int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) { 723 CeedVector_Cuda *impl; 724 Ceed ceed; 725 726 CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); 727 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda)); 728 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda)); 729 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda)); 730 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda)); 731 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda)); 732 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda)); 733 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda)); 734 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda)); 735 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda)); 736 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda)); 737 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda)); 738 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda)); 739 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda)); 740 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda)); 741 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda)); 742 CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda)); 743 CeedCallBackend(CeedCalloc(1, &impl)); 744 CeedCallBackend(CeedVectorSetData(vec, impl)); 745 return CEED_ERROR_SUCCESS; 746 } 747 748 //------------------------------------------------------------------------------ 749