static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n"; /* SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous operations in the default stream and does not sync these operations since it assumes routines consume the destination data are also on the default stream. However, when destination data in on CPU, SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd(). */ #include int main(int argc, char **argv) { PetscInt i, n = 100000; /* Big enough to make the asynchronous copy meaningful */ PetscScalar *val; const PetscScalar *yval; Vec x, y; PetscMPIInt size; IS ix, iy; VecScatter vscat; PetscFunctionBegin; PetscFunctionBeginUser; PetscCall(PetscInitialize(&argc, &argv, NULL, help)); PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test"); /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous cudaMemcpyDeviceToHost. */ PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x)); PetscCall(VecSetFromOptions(x)); PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y)); PetscCall(VecSetFromOptions(y)); /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ PetscCall(VecGetArray(x, &val)); for (i = 0; i < n; i++) val[i] = i / 2.0; PetscCall(VecRestoreArray(x, &val)); PetscCall(VecScale(x, 2.0)); PetscCall(VecSet(y, 314)); /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ PetscCall(VecGetArray(y, &val)); PetscCall(VecRestoreArray(y, &val)); /* The vscat is simply a vector copy */ PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix)); PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy)); PetscCall(VecScatterCreate(x, ix, y, iy, &vscat)); /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed. */ PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD)); PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD)); PetscCall(VecGetArrayRead(y, &yval)); /* Display the first and the last entries of y to see if it is valid on host */ PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1]))); PetscCall(VecRestoreArrayRead(y, &yval)); PetscCall(VecDestroy(&x)); PetscCall(VecDestroy(&y)); PetscCall(ISDestroy(&ix)); PetscCall(ISDestroy(&iy)); PetscCall(VecScatterDestroy(&vscat)); PetscCall(PetscFinalize()); return 0; } /*TEST test: requires: cuda diff_args: -j #make sure the host memory is pinned # sf_backend cuda is not needed if compiling only with cuda args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0 test: suffix: hip requires: hip diff_args: -j output_file: output/ex2_1.out #make sure the host memory is pinned # sf_backend hip is not needed if compiling only with hip args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0 TEST*/