xref: /petsc/src/vec/is/sf/tests/ex2.c (revision efa12513287cff49a2b9648ae83199dcbfaad71a)
1 static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on the GPU, it performs
  asynchronous operations in the default stream and does not sync these operations, since it
  assumes routines that consume the destination data are also on the default stream. However,
  when destination data is on the CPU, SF must guarantee the data is ready to use on the CPU
  after PetscSFXxxEnd().
 */
8 
9 #include <petscvec.h>
10 int main(int argc,char **argv)
11 {
12   PetscErrorCode     ierr;
13   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
14   PetscScalar        *val;
15   const PetscScalar  *yval;
16   Vec                x,y;
17   PetscMPIInt        size;
18   IS                 ix,iy;
19   VecScatter         vscat;
20 
21   PetscFunctionBegin;
22   ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
23   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr);
24   if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");
25 
26   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
27      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
28      cudaMemcpyDeviceToHost.
29    */
30   ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&x);CHKERRQ(ierr);
31   ierr = VecSetFromOptions(x);CHKERRQ(ierr);
32   ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&y);CHKERRQ(ierr);
33   ierr = VecSetFromOptions(y);CHKERRQ(ierr);
34 
35   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
36   ierr = VecGetArray(x,&val);CHKERRQ(ierr);
37   for (i=0; i<n; i++) val[i] = i/2.0;
38   ierr = VecRestoreArray(x,&val);CHKERRQ(ierr);
39   ierr = VecScale(x,2.0);CHKERRQ(ierr);
40   ierr = VecSet(y,314);CHKERRQ(ierr);
41 
42   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
43   ierr = VecGetArray(y,&val);
44   ierr = VecRestoreArray(y,&val);CHKERRQ(ierr);
45 
46   /* The vscat is simply a vector copy */
47   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);
48   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);
49   ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr);
50 
51   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
52      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
53    */
54   ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
55   ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
56   ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr);
57   /* Display the first and the last entries of y to see if it is valid on host */
58   ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr);
59   ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr);
60 
61   ierr = VecDestroy(&x);CHKERRQ(ierr);
62   ierr = VecDestroy(&y);CHKERRQ(ierr);
63   ierr = ISDestroy(&ix);CHKERRQ(ierr);
64   ierr = ISDestroy(&iy);CHKERRQ(ierr);
65   ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr);
66   ierr = PetscFinalize();
67   return ierr;
68 }
69 
70 /*TEST
71 
72    test:
73     requires: cuda
74     #make sure the host memory is pinned
75     # sf_backend cuda is not needed if compiling only with cuda
76     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
77 
78    test:
79     suffix: hip
80     requires: hip
81     output_file: output/ex2_1.out
82     #make sure the host memory is pinned
83     # sf_backend hip is not needed if compiling only with hip
84     args:  -vec_type hip -sf_backend hip -vec_pinned_memory_min 0
85 
86 TEST*/
87