1 static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n"; 2 /* 3 SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous 4 operations in the default stream and does not sync these operations since it assumes routines consume 5 the destination data are also on the default stream. However, when destination data in on CPU, 6 SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd(). 7 */ 8 9 #include <petscvec.h> 10 int main(int argc, char **argv) 11 { 12 PetscInt i, n = 100000; /* Big enough to make the asynchronous copy meaningful */ 13 PetscScalar *val; 14 const PetscScalar *yval; 15 Vec x, y; 16 PetscMPIInt size; 17 IS ix, iy; 18 VecScatter vscat; 19 20 PetscFunctionBegin; 21 PetscFunctionBeginUser; 22 PetscCall(PetscInitialize(&argc, &argv, (char *)0, help)); 23 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 24 PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test"); 25 26 /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, 27 since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous 28 cudaMemcpyDeviceToHost. 29 */ 30 PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x)); 31 PetscCall(VecSetFromOptions(x)); 32 PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y)); 33 PetscCall(VecSetFromOptions(y)); 34 35 /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ 36 PetscCall(VecGetArray(x, &val)); 37 for (i = 0; i < n; i++) val[i] = i / 2.0; 38 PetscCall(VecRestoreArray(x, &val)); 39 PetscCall(VecScale(x, 2.0)); 40 PetscCall(VecSet(y, 314)); 41 42 /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ 43 PetscCall(VecGetArray(y, &val)); 44 PetscCall(VecRestoreArray(y, &val)); 45 46 /* The vscat is simply a vector copy */ 47 PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix)); 48 PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy)); 49 PetscCall(VecScatterCreate(x, ix, y, iy, &vscat)); 50 51 /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous 52 cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed. 53 */ 54 PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD)); 55 PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD)); 56 PetscCall(VecGetArrayRead(y, &yval)); 57 /* Display the first and the last entries of y to see if it is valid on host */ 58 PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1]))); 59 PetscCall(VecRestoreArrayRead(y, &yval)); 60 61 PetscCall(VecDestroy(&x)); 62 PetscCall(VecDestroy(&y)); 63 PetscCall(ISDestroy(&ix)); 64 PetscCall(ISDestroy(&iy)); 65 PetscCall(VecScatterDestroy(&vscat)); 66 PetscCall(PetscFinalize()); 67 return 0; 68 } 69 70 /*TEST 71 72 test: 73 requires: cuda 74 diff_args: -j 75 #make sure the host memory is pinned 76 # sf_backend cuda is not needed if compiling only with cuda 77 args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0 78 79 test: 80 suffix: hip 81 requires: hip 82 diff_args: -j 83 output_file: output/ex2_1.out 84 #make sure the host memory is pinned 85 # sf_backend hip is not needed if compiling only with hip 86 args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0 87 88 TEST*/ 89