1 static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n"; 2 /* 3 SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous 4 operations in the default stream and does not sync these operations since it assumes routines consume 5 the destination data are also on the default stream. However, when destination data in on CPU, 6 SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd(). 7 */ 8 9 #include <petscvec.h> 10 int main(int argc,char **argv) 11 { 12 PetscErrorCode ierr; 13 PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */ 14 PetscScalar *val; 15 const PetscScalar *yval; 16 Vec x,y; 17 PetscMPIInt size; 18 IS ix,iy; 19 VecScatter vscat; 20 21 PetscFunctionBegin; 22 ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr; 23 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr); 24 if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n"); 25 26 /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, 27 since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous 28 cudaMemcpyDeviceToHost. 29 */ 30 ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&x);CHKERRQ(ierr); 31 ierr = VecSetFromOptions(x);CHKERRQ(ierr); 32 ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&y);CHKERRQ(ierr); 33 ierr = VecSetFromOptions(y);CHKERRQ(ierr); 34 35 /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ 36 ierr = VecGetArray(x,&val);CHKERRQ(ierr); 37 for (i=0; i<n; i++) val[i] = i/2.0; 38 ierr = VecRestoreArray(x,&val);CHKERRQ(ierr); 39 ierr = VecScale(x,2.0);CHKERRQ(ierr); 40 ierr = VecSet(y,314);CHKERRQ(ierr); 41 42 /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ 43 ierr = VecGetArray(y,&val);CHKERRQ(ierr); 44 ierr = VecRestoreArray(y,&val);CHKERRQ(ierr); 45 46 /* The vscat is simply a vector copy */ 47 ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);CHKERRQ(ierr); 48 ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);CHKERRQ(ierr); 49 ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr); 50 51 /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous 52 cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed. 53 */ 54 ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 55 ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 56 ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr); 57 /* Display the first and the last entries of y to see if it is valid on host */ 58 ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr); 59 ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr); 60 61 ierr = VecDestroy(&x);CHKERRQ(ierr); 62 ierr = VecDestroy(&y);CHKERRQ(ierr); 63 ierr = ISDestroy(&ix);CHKERRQ(ierr); 64 ierr = ISDestroy(&iy);CHKERRQ(ierr); 65 ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr); 66 ierr = PetscFinalize(); 67 return ierr; 68 } 69 70 /*TEST 71 72 test: 73 requires: cuda 74 diff_args: -j 75 #make sure the host memory is pinned 76 # sf_backend cuda is not needed if compiling only with cuda 77 args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0 78 79 test: 80 suffix: hip 81 requires: hip 82 diff_args: -j 83 output_file: output/ex2_1.out 84 #make sure the host memory is pinned 85 # sf_backend hip is not needed if compiling only with hip 86 args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0 87 88 TEST*/ 89