static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, it performs these
  operations in the default stream and does not synchronize them, since it assumes the routines that
  consume the destination data also run on the default stream. However, when the destination data is on
  the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd().
*/
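/* A rough sketch of the invariant being tested (illustrative pseudo-calls on a hypothetical
   `stream`, not PETSc's actual implementation):

     cudaMemcpyAsync(host_y, device_x, bytes, cudaMemcpyDeviceToHost, stream); // PetscSFXxxBegin()
     cudaStreamSynchronize(stream); // PetscSFXxxEnd() must ensure this when the destination is on host
     use(host_y);                   // safe only after the synchronization above

   Without the synchronization, the host could read stale data while the copy is still in flight.
*/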

#include <petscvec.h>
int main(int argc, char **argv)
{
  PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec                x, y;
  PetscMPIInt        size;
  IS                 ix, iy;
  VecScatter         vscat;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on the host, we make y a CUDA
     vector too, since we want y's host memory to be pinned (i.e., non-pageable), to really trigger
     asynchronous cudaMemcpyDeviceToHost.
  */
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
  PetscCall(VecSetFromOptions(x));
  PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
  PetscCall(VecSetFromOptions(y));
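  /* With -vec_type cuda (or hip) from the TEST block below, VecSetFromOptions() turns x and y into
     device vectors, and -vec_pinned_memory_min 0 forces their host buffers to be pinned. */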

  /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  PetscCall(VecGetArray(x, &val));
  for (i = 0; i < n; i++) val[i] = i / 2.0;
  PetscCall(VecRestoreArray(x, &val));
  PetscCall(VecScale(x, 2.0));
  PetscCall(VecSet(y, 314));

  /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  PetscCall(VecGetArray(y, &val));
  PetscCall(VecRestoreArray(y, &val));
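  /* VecGetArray() copies the latest (GPU) data to the host and, with VecRestoreArray(), marks the
     host copy as the valid one, so this empty Get/Restore pair migrates y to the CPU. */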

  /* The vscat is simply a vector copy */
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
  PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));
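  /* VecScatter is built on PetscSF, so this copy goes through the SF communication path that the
     help string refers to. */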

  /* Do a device-to-host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host;
     otherwise, wrong data will be displayed.
  */
  PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
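  /* If SF did not synchronize its stream in VecScatterEnd(), the asynchronous device-to-host copy
     could still be in flight here, and the values printed below would be stale. */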
  PetscCall(VecGetArrayRead(y, &yval));
  /* Display the first and the last entries of y to see if it is valid on host */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
  PetscCall(VecRestoreArrayRead(y, &yval));

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(ISDestroy(&ix));
  PetscCall(ISDestroy(&iy));
  PetscCall(VecScatterDestroy(&vscat));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  test:
    requires: cuda
    diff_args: -j
    # make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

  test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    # make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/