static char help[] = "Test SF CUDA stream synchronization in device-to-host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, it issues
  asynchronous operations in the default stream and does not synchronize them, since it assumes the
  routines consuming the destination data are also on the default stream. However, when the
  destination data is on the CPU, SF must guarantee the data is ready to use on the CPU after
  PetscSFXxxEnd().
*/

#include <petscvec.h>
int main(int argc,char **argv)
{
  PetscInt           i,n = 100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec                x,y;
  PetscMPIInt        size;
  IS                 ix,iy;
  VecScatter         vscat;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc,&argv,NULL,help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
  PetscCheck(size == 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on the host, we make y a
     CUDA vector too, since we want y's host memory to be pinned (i.e., non-pageable) so that a
     truly asynchronous cudaMemcpyDeviceToHost is triggered.
  */
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
  PetscCall(VecSetFromOptions(x));
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
  PetscCall(VecSetFromOptions(y));

  /* Initialize x, y, and push them to the GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  PetscCall(VecGetArray(x,&val));
  for (i=0; i<n; i++) val[i] = i/2.0;
  PetscCall(VecRestoreArray(x,&val));
  PetscCall(VecScale(x,2.0));
  PetscCall(VecSet(y,314));

  /* Pull y to the CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  PetscCall(VecGetArray(y,&val));
  PetscCall(VecRestoreArray(y,&val));

  /* The vscat is simply a vector copy */
  PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
  PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
  PetscCall(VecScatterCreate(x,ix,y,iy,&vscat));

  /* Do a device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host after
     VecScatterEnd(). Otherwise, wrong data will be displayed.
  */
  PetscCall(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall(VecGetArrayRead(y,&yval));
  /* Display the first and last entries of y to see if it is valid on the host */
  PetscCall(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(double)PetscRealPart(yval[0]),n-1,(double)PetscRealPart(yval[n-1])));
  PetscCall(VecRestoreArrayRead(y,&yval));

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(ISDestroy(&ix));
  PetscCall(ISDestroy(&iy));
  PetscCall(VecScatterDestroy(&vscat));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  test:
    requires: cuda
    diff_args: -j
    # make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

  test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    # make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
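/*
   For reference, a minimal sketch of the pattern SF must implement internally when the destination
   is host memory, assuming the plain CUDA runtime API. The names host, device, n, and stream below
   are hypothetical placeholders, not part of this test. The point: an asynchronous device-to-host
   copy into pinned memory is only safe to read on the host after its stream has been synchronized.

     PetscScalar *host;
     cudaMallocHost((void **)&host,n*sizeof(*host));                              // pinned (non-pageable) host buffer
     cudaMemcpyAsync(host,device,n*sizeof(*host),cudaMemcpyDeviceToHost,stream);  // returns before the copy finishes
     cudaStreamSynchronize(stream);                                               // without this, host may still hold stale data
     // host[] is now valid on the CPU
*/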