static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  operations in the default stream and does not sync these operations since it assumes routines that
  consume the destination data are also on the default stream. However, when destination data is on CPU,
  SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
*/

#include <petscvec.h>
int main(int argc,char **argv)
{
  PetscErrorCode    ierr;
  PetscInt          i,n=100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec               x,y;
  PetscMPIInt       size;
  IS                ix,iy;
  VecScatter        vscat;

  PetscFunctionBegin;
  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  CHKERRMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
  PetscCheckFalse(size != 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on host, we make y a CUDA vector,
     since we want to have y's memory on host pinned (i.e., non-pageable), to really trigger asynchronous
     cudaMemcpyDeviceToHost.
  */
  CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
  CHKERRQ(VecSetFromOptions(x));
  CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
  CHKERRQ(VecSetFromOptions(y));

  /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  CHKERRQ(VecGetArray(x,&val));
  for (i=0; i<n; i++) val[i] = i/2.0;
  CHKERRQ(VecRestoreArray(x,&val));
  CHKERRQ(VecScale(x,2.0)); /* VecScale runs on device, moving x's up-to-date copy to GPU */
  CHKERRQ(VecSet(y,314));

  /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  CHKERRQ(VecGetArray(y,&val));
  CHKERRQ(VecRestoreArray(y,&val));

  /* The vscat is simply a vector copy (identity scatter over all n entries) */
  CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
  CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
  CHKERRQ(VecScatterCreate(x,ix,y,iy,&vscat));

  /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
     cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed.
  */
  CHKERRQ(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  CHKERRQ(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  CHKERRQ(VecGetArrayRead(y,&yval));
  /* Display the first and the last entries of y to see if it is valid on host */
  CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1])));
  CHKERRQ(VecRestoreArrayRead(y,&yval));

  CHKERRQ(VecDestroy(&x));
  CHKERRQ(VecDestroy(&y));
  CHKERRQ(ISDestroy(&ix));
  CHKERRQ(ISDestroy(&iy));
  CHKERRQ(VecScatterDestroy(&vscat));
  ierr = PetscFinalize();
  return ierr;
}

/*TEST

   test:
     requires: cuda
     diff_args: -j
     #make sure the host memory is pinned
     # sf_backend cuda is not needed if compiling only with cuda
     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
     suffix: hip
     requires: hip
     diff_args: -j
     output_file: output/ex2_1.out
     #make sure the host memory is pinned
     # sf_backend hip is not needed if compiling only with hip
     args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/