/* Help text shown by the PETSc -help option; summarizes what this test exercises. */
static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, it performs
  asynchronous operations in the default stream and does not sync these operations, since it assumes
  the routines consuming the destination data are also on the default stream. However, when the
  destination data is on the CPU, SF must guarantee the data is ready to use on the CPU after
  PetscSFXxxEnd().
*/
8
9 #include <petscvec.h>
main(int argc,char ** argv)10 int main(int argc, char **argv)
11 {
12 PetscInt i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
13 PetscScalar *val;
14 const PetscScalar *yval;
15 Vec x, y;
16 PetscMPIInt size;
17 IS ix, iy;
18 VecScatter vscat;
19
20 PetscFunctionBegin;
21 PetscFunctionBeginUser;
22 PetscCall(PetscInitialize(&argc, &argv, NULL, help));
23 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
24 PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");
25
26 /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
27 since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
28 cudaMemcpyDeviceToHost.
29 */
30 PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
31 PetscCall(VecSetFromOptions(x));
32 PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
33 PetscCall(VecSetFromOptions(y));
34
35 /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
36 PetscCall(VecGetArray(x, &val));
37 for (i = 0; i < n; i++) val[i] = i / 2.0;
38 PetscCall(VecRestoreArray(x, &val));
39 PetscCall(VecScale(x, 2.0));
40 PetscCall(VecSet(y, 314));
41
42 /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
43 PetscCall(VecGetArray(y, &val));
44 PetscCall(VecRestoreArray(y, &val));
45
46 /* The vscat is simply a vector copy */
47 PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
48 PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
49 PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));
50
51 /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
52 cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed.
53 */
54 PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
55 PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
56 PetscCall(VecGetArrayRead(y, &yval));
57 /* Display the first and the last entries of y to see if it is valid on host */
58 PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
59 PetscCall(VecRestoreArrayRead(y, &yval));
60
61 PetscCall(VecDestroy(&x));
62 PetscCall(VecDestroy(&y));
63 PetscCall(ISDestroy(&ix));
64 PetscCall(ISDestroy(&iy));
65 PetscCall(VecScatterDestroy(&vscat));
66 PetscCall(PetscFinalize());
67 return 0;
68 }
69
70 /*TEST
71
72 test:
73 requires: cuda
74 diff_args: -j
75 #make sure the host memory is pinned
76 # sf_backend cuda is not needed if compiling only with cuda
77 args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
78
79 test:
80 suffix: hip
81 requires: hip
82 diff_args: -j
83 output_file: output/ex2_1.out
84 #make sure the host memory is pinned
85 # sf_backend hip is not needed if compiling only with hip
86 args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0
87
88 TEST*/
89