xref: /petsc/src/vec/is/sf/tests/ex2.c (revision 58d68138c660dfb4e9f5b03334792cd4f2ffd7cc)
static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF issues
  asynchronous operations in the default stream and does not synchronize them, since it assumes
  the routines that consume the destination data also run on the default stream. However, when the
  destination data is on the CPU, SF must guarantee the data is ready to use on the CPU after
  PetscSFXxxEnd().
 */
9 #include <petscvec.h>
10 int main(int argc, char **argv) {
11   PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
12   PetscScalar       *val;
13   const PetscScalar *yval;
14   Vec                x, y;
15   PetscMPIInt        size;
16   IS                 ix, iy;
17   VecScatter         vscat;
18 
19   PetscFunctionBegin;
20   PetscFunctionBeginUser;
21   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
22   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
23   PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");
24 
25   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
26      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
27      cudaMemcpyDeviceToHost.
28    */
29   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
30   PetscCall(VecSetFromOptions(x));
31   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
32   PetscCall(VecSetFromOptions(y));
33 
34   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
35   PetscCall(VecGetArray(x, &val));
36   for (i = 0; i < n; i++) val[i] = i / 2.0;
37   PetscCall(VecRestoreArray(x, &val));
38   PetscCall(VecScale(x, 2.0));
39   PetscCall(VecSet(y, 314));
40 
41   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
42   PetscCall(VecGetArray(y, &val));
43   PetscCall(VecRestoreArray(y, &val));
44 
45   /* The vscat is simply a vector copy */
46   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
47   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
48   PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));
49 
50   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
51      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
52    */
53   PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
54   PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
55   PetscCall(VecGetArrayRead(y, &yval));
56   /* Display the first and the last entries of y to see if it is valid on host */
57   PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
58   PetscCall(VecRestoreArrayRead(y, &yval));
59 
60   PetscCall(VecDestroy(&x));
61   PetscCall(VecDestroy(&y));
62   PetscCall(ISDestroy(&ix));
63   PetscCall(ISDestroy(&iy));
64   PetscCall(VecScatterDestroy(&vscat));
65   PetscCall(PetscFinalize());
66   return 0;
67 }
68 
/*TEST

   test:
    requires: cuda
    diff_args: -j
    # make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    # make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
88