/* xref: /petsc/src/vec/is/sf/tests/ex2.c (revision 8fb5bd83c3955fefcf33a54e3bb66920a9fa884b) */
1 static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
2 /*
3   SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
4   operations in the default stream and does not sync these operations since it assumes routines consume
5   the destination data are also on the default stream. However, when destination data in on CPU,
6   SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd().
7  */
8 
9 #include <petscvec.h>
10 int main(int argc,char **argv)
11 {
12   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
13   PetscScalar        *val;
14   const PetscScalar  *yval;
15   Vec                x,y;
16   PetscMPIInt        size;
17   IS                 ix,iy;
18   VecScatter         vscat;
19 
20   PetscFunctionBegin;
21   PetscCall(PetscInitialize(&argc,&argv,(char*)0,help));
22   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
23   PetscCheck(size == 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test");
24 
25   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
26      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
27      cudaMemcpyDeviceToHost.
28    */
29   PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
30   PetscCall(VecSetFromOptions(x));
31   PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
32   PetscCall(VecSetFromOptions(y));
33 
34   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
35   PetscCall(VecGetArray(x,&val));
36   for (i=0; i<n; i++) val[i] = i/2.0;
37   PetscCall(VecRestoreArray(x,&val));
38   PetscCall(VecScale(x,2.0));
39   PetscCall(VecSet(y,314));
40 
41   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
42   PetscCall(VecGetArray(y,&val));
43   PetscCall(VecRestoreArray(y,&val));
44 
45   /* The vscat is simply a vector copy */
46   PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
47   PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
48   PetscCall(VecScatterCreate(x,ix,y,iy,&vscat));
49 
50   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
51      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
52    */
53   PetscCall(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
54   PetscCall(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
55   PetscCall(VecGetArrayRead(y,&yval));
56   /* Display the first and the last entries of y to see if it is valid on host */
57   PetscCall(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1])));
58   PetscCall(VecRestoreArrayRead(y,&yval));
59 
60   PetscCall(VecDestroy(&x));
61   PetscCall(VecDestroy(&y));
62   PetscCall(ISDestroy(&ix));
63   PetscCall(ISDestroy(&iy));
64   PetscCall(VecScatterDestroy(&vscat));
65   PetscCall(PetscFinalize());
66   return 0;
67 }

/*TEST

   test:
    requires: cuda
    diff_args: -j
    #make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    #make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args:  -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/