1a90d8e38SSatish Balay static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
2a90d8e38SSatish Balay
3a90d8e38SSatish Balay /*
4a90d8e38SSatish Balay This is a simple test to measure the latency of MPI communication.
5a90d8e38SSatish Balay The test is run with two processes. The first process sends a message
6a90d8e38SSatish Balay to the second process, and after having received the message, the second
7a90d8e38SSatish Balay process sends a message back to the first process once. The is repeated
8a90d8e38SSatish Balay a number of times. The latency is defined as half time of the round-trip.
9a90d8e38SSatish Balay
10a90d8e38SSatish Balay It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
11a90d8e38SSatish Balay
12*a8cf87e0SJunchao Zhang Usage: mpiexec -n 2 ./ex4k -mtype <type>
13a90d8e38SSatish Balay Other arguments have a default value that is also used in osu_latency.
14a90d8e38SSatish Balay
15a90d8e38SSatish Balay Examples:
16a90d8e38SSatish Balay
17a90d8e38SSatish Balay On Summit at OLCF:
18a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos
19a90d8e38SSatish Balay
20a90d8e38SSatish Balay On Crusher at OLCF:
21a90d8e38SSatish Balay srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos
22a90d8e38SSatish Balay */
23a90d8e38SSatish Balay #include <petscsf.h>
24a90d8e38SSatish Balay #include <Kokkos_Core.hpp>
25a90d8e38SSatish Balay
26a90d8e38SSatish Balay /* Same values as OSU microbenchmarks */
27a90d8e38SSatish Balay #define LAT_LOOP_SMALL 10000
28a90d8e38SSatish Balay #define LAT_SKIP_SMALL 100
29a90d8e38SSatish Balay #define LAT_LOOP_LARGE 1000
30a90d8e38SSatish Balay #define LAT_SKIP_LARGE 10
31a90d8e38SSatish Balay #define LARGE_MESSAGE_SIZE 8192
32a90d8e38SSatish Balay
main(int argc,char ** argv)33a90d8e38SSatish Balay int main(int argc, char **argv)
34a90d8e38SSatish Balay {
35a90d8e38SSatish Balay PetscSF sf[64];
36a90d8e38SSatish Balay PetscLogDouble t_start = 0, t_end = 0, time[64];
37a90d8e38SSatish Balay PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10;
38a90d8e38SSatish Balay PetscInt maxn = 512 * 1024; /* max 4M bytes messages */
39a90d8e38SSatish Balay PetscSFNode *iremote;
40a90d8e38SSatish Balay PetscMPIInt rank, size;
41a90d8e38SSatish Balay PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
42a90d8e38SSatish Balay size_t msgsize;
43a90d8e38SSatish Balay PetscMemType mtype = PETSC_MEMTYPE_HOST;
44a90d8e38SSatish Balay char mstring[16] = {0};
45a90d8e38SSatish Balay PetscBool set;
46a90d8e38SSatish Balay PetscInt skipSmall = -1, loopSmall = -1;
47a90d8e38SSatish Balay MPI_Op op = MPI_REPLACE;
48a90d8e38SSatish Balay
49a90d8e38SSatish Balay PetscFunctionBeginUser;
50a90d8e38SSatish Balay Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
51a90d8e38SSatish Balay PetscCall(PetscInitialize(&argc, &argv, NULL, help));
52a90d8e38SSatish Balay PetscCall(PetscKokkosInitializeCheck());
53a90d8e38SSatish Balay
54a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
55a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
56a90d8e38SSatish Balay PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");
57a90d8e38SSatish Balay
58a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
59a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
60a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));
61a90d8e38SSatish Balay
62a90d8e38SSatish Balay PetscCall(PetscMalloc1(maxn, &iremote));
63a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
64a90d8e38SSatish Balay if (set) {
65a90d8e38SSatish Balay PetscBool isHost, isKokkos;
66a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
67a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
68a90d8e38SSatish Balay if (isHost) mtype = PETSC_MEMTYPE_HOST;
69a90d8e38SSatish Balay else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
70a90d8e38SSatish Balay else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
71a90d8e38SSatish Balay }
72a90d8e38SSatish Balay
73a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) {
74a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
75a90d8e38SSatish Balay } else {
76a90d8e38SSatish Balay PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
77a90d8e38SSatish Balay PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
78a90d8e38SSatish Balay }
79a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
80a90d8e38SSatish Balay for (i = 0; i < maxn; i++) {
81a90d8e38SSatish Balay pbuf[i] = 123.0;
82a90d8e38SSatish Balay ebuf[i] = 456.0;
83a90d8e38SSatish Balay }
84a90d8e38SSatish Balay
85a90d8e38SSatish Balay for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
86a90d8e38SSatish Balay PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
87a90d8e38SSatish Balay PetscCall(PetscSFSetFromOptions(sf[i]));
88a90d8e38SSatish Balay if (rank == 0) {
89a90d8e38SSatish Balay nroots = n;
90a90d8e38SSatish Balay nleaves = 0;
91a90d8e38SSatish Balay } else {
92a90d8e38SSatish Balay nroots = 0;
93a90d8e38SSatish Balay nleaves = n;
94a90d8e38SSatish Balay for (j = 0; j < nleaves; j++) {
95a90d8e38SSatish Balay iremote[j].rank = 0;
96a90d8e38SSatish Balay iremote[j].index = j;
97a90d8e38SSatish Balay }
98a90d8e38SSatish Balay }
99a90d8e38SSatish Balay PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
100a90d8e38SSatish Balay PetscCall(PetscSFSetUp(sf[i]));
101a90d8e38SSatish Balay }
102a90d8e38SSatish Balay
103a90d8e38SSatish Balay if (loopSmall > 0) {
104a90d8e38SSatish Balay nskip = skipSmall;
105a90d8e38SSatish Balay niter = loopSmall;
106a90d8e38SSatish Balay } else {
107a90d8e38SSatish Balay nskip = LAT_SKIP_SMALL;
108a90d8e38SSatish Balay niter = LAT_LOOP_SMALL;
109a90d8e38SSatish Balay }
110a90d8e38SSatish Balay
111a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
112a90d8e38SSatish Balay msgsize = sizeof(PetscScalar) * n;
113a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) {
114a90d8e38SSatish Balay PetscCall(PetscArraycpy(rootdata, pbuf, n));
115a90d8e38SSatish Balay PetscCall(PetscArraycpy(leafdata, ebuf, n));
116a90d8e38SSatish Balay } else {
117a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n);
118a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n);
119a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
120a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
121a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst1, src1));
122a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst2, src2));
123a90d8e38SSatish Balay }
124a90d8e38SSatish Balay
125a90d8e38SSatish Balay if (msgsize > LARGE_MESSAGE_SIZE) {
126a90d8e38SSatish Balay nskip = LAT_SKIP_LARGE;
127a90d8e38SSatish Balay niter = LAT_LOOP_LARGE;
128a90d8e38SSatish Balay }
129a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
130a90d8e38SSatish Balay
131a90d8e38SSatish Balay for (i = 0; i < niter + nskip; i++) {
132a90d8e38SSatish Balay if (i == nskip) {
133a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence());
134a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
135a90d8e38SSatish Balay t_start = MPI_Wtime();
136a90d8e38SSatish Balay }
137a90d8e38SSatish Balay PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
138a90d8e38SSatish Balay PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
139a90d8e38SSatish Balay PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
140a90d8e38SSatish Balay PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
141a90d8e38SSatish Balay }
142a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence());
143a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
144a90d8e38SSatish Balay t_end = MPI_Wtime();
145a90d8e38SSatish Balay time[j] = (t_end - t_start) * 1e6 / (niter * 2);
146a90d8e38SSatish Balay }
147a90d8e38SSatish Balay
148a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
149a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
150a90d8e38SSatish Balay PetscCall(PetscSFDestroy(&sf[j]));
151a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
152a90d8e38SSatish Balay }
153a90d8e38SSatish Balay PetscCall(PetscFree2(pbuf, ebuf));
154a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) {
155a90d8e38SSatish Balay PetscCall(PetscFree2(rootdata, leafdata));
156a90d8e38SSatish Balay } else {
157a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(rootdata));
158a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(leafdata));
159a90d8e38SSatish Balay }
160a90d8e38SSatish Balay PetscCall(PetscFree(iremote));
161a90d8e38SSatish Balay PetscCall(PetscFinalize());
162a90d8e38SSatish Balay Kokkos::finalize();
163a90d8e38SSatish Balay return 0;
164a90d8e38SSatish Balay }
165a90d8e38SSatish Balay
166a90d8e38SSatish Balay /*TEST
167a90d8e38SSatish Balay testset:
168a90d8e38SSatish Balay requires: kokkos
169a90d8e38SSatish Balay # use small numbers to make the test cheap
170a90d8e38SSatish Balay args: -maxn 4 -skipSmall 1 -loopSmall 1
171a90d8e38SSatish Balay filter: grep "DOES_NOT_EXIST"
172a90d8e38SSatish Balay output_file: output/empty.out
173a90d8e38SSatish Balay nsize: 2
174a90d8e38SSatish Balay
175a90d8e38SSatish Balay test:
176a90d8e38SSatish Balay args: -mtype {{host kokkos}}
177a90d8e38SSatish Balay
178a90d8e38SSatish Balay test:
179a90d8e38SSatish Balay requires: cuda mpi_gpu_aware mpix_stream
180a90d8e38SSatish Balay suffix: mpix
181a90d8e38SSatish Balay # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU
182a90d8e38SSatish Balay env: MPIR_CVAR_CH4_RESERVE_VCIS=1
183a90d8e38SSatish Balay args: -mtype kokkos -sf_use_stream_aware_mpi 1
184a90d8e38SSatish Balay
185a90d8e38SSatish Balay TEST*/
186