xref: /petsc/src/vec/is/sf/tests/ex2.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  operations in the default stream and does not sync these operations, since it assumes the routines
  consuming the destination data are also on the default stream. However, when destination data is on CPU,
  SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
 */
8 
9 #include <petscvec.h>
10 int main(int argc, char **argv)
11 {
12   PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
13   PetscScalar       *val;
14   const PetscScalar *yval;
15   Vec                x, y;
16   PetscMPIInt        size;
17   IS                 ix, iy;
18   VecScatter         vscat;
19 
20   PetscFunctionBegin;
21   PetscFunctionBeginUser;
22   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
23   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
24   PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");
25 
26   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
27      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
28      cudaMemcpyDeviceToHost.
29    */
30   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
31   PetscCall(VecSetFromOptions(x));
32   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
33   PetscCall(VecSetFromOptions(y));
34 
35   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
36   PetscCall(VecGetArray(x, &val));
37   for (i = 0; i < n; i++) val[i] = i / 2.0;
38   PetscCall(VecRestoreArray(x, &val));
39   PetscCall(VecScale(x, 2.0));
40   PetscCall(VecSet(y, 314));
41 
42   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
43   PetscCall(VecGetArray(y, &val));
44   PetscCall(VecRestoreArray(y, &val));
45 
46   /* The vscat is simply a vector copy */
47   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
48   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
49   PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));
50 
51   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
52      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
53    */
54   PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
55   PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
56   PetscCall(VecGetArrayRead(y, &yval));
57   /* Display the first and the last entries of y to see if it is valid on host */
58   PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
59   PetscCall(VecRestoreArrayRead(y, &yval));
60 
61   PetscCall(VecDestroy(&x));
62   PetscCall(VecDestroy(&y));
63   PetscCall(ISDestroy(&ix));
64   PetscCall(ISDestroy(&iy));
65   PetscCall(VecScatterDestroy(&vscat));
66   PetscCall(PetscFinalize());
67   return 0;
68 }
69 
70 /*TEST
71 
72    test:
73     requires: cuda
74     diff_args: -j
75     #make sure the host memory is pinned
76     # sf_backend cuda is not needed if compiling only with cuda
77     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
78 
79    test:
80     suffix: hip
81     requires: hip
82     diff_args: -j
83     output_file: output/ex2_1.out
84     #make sure the host memory is pinned
85     # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0
87 
88 TEST*/
89