static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";

/* SF uses asynchronous operations internally. When destination data is on GPU, it does
   asynchronous operations in the default stream and does not sync these operations since
   it assumes routines consuming the destination data are also on the default stream.
   However, when destination data is on CPU, SF must guarantee the data is ready to use
   on CPU after PetscSFXxxEnd(). */

/* NOTE(review): the header name after #include was lost in extraction (angle-bracket
   content stripped from this file); presumably <petscvec.h> given the Vec/VecScatter/IS
   usage below — confirm against the original test and restore. */
#include

int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar *val;           /* writable host-side view of x's array */
  const PetscScalar *yval;    /* read-only view of y's array */
  Vec x,y;
  PetscMPIInt size;
  IS ix,iy;                   /* index sets describing the scatter pattern */
  VecScatter vscat;

  PetscFunctionBegin;
  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr);
  /* Sequential test only: abort on more than one MPI rank */
  if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on host, we make
     y a CUDA vector, since we want to have y's memory on host pinned (i.e., non-pageable),
     to really trigger asynchronous cudaMemcpyDeviceToHost. */
  /* NOTE(review): the vectors are created as plain Seq here; presumably the CUDA type is
     selected at runtime via -vec_type through VecSetFromOptions — verify against the
     test's runex options. */
  ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&x);CHKERRQ(ierr);
  ierr = VecSetFromOptions(x);CHKERRQ(ierr);
  ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&y);CHKERRQ(ierr);
  ierr = VecSetFromOptions(y);CHKERRQ(ierr);

  /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  ierr = VecGetArray(x,&val);CHKERRQ(ierr);
  for (i=0; i
  /* NOTE(review): SOURCE is truncated here mid-loop — everything from the loop's '<'
     comparison onward was eaten by the same angle-bracket extraction bug. The remainder
     of main() (loop body, VecRestoreArray, scatter creation/execution, result
     verification, and cleanup) is not visible and is intentionally left unreconstructed. */