static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes. The first process sends a message
  to the second process, and after having received the message, the second
  process sends a message back to the first process once. This is repeated
  a number of times. The latency is defined as half the time of the round trip.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpiexec -n 2 ./ex4k -mtype <type>
  Other arguments have a default value that is also used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos
*/
#include <petscsf.h>
#include <Kokkos_Core.hpp>

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max 4M bytes messages */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;

  PetscFunctionBeginUser;
  Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCall(PetscKokkosInitializeCheck());

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscBool isHost, isKokkos;
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

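  /* Allocate root/leaf buffers either on the host or, for -mtype kokkos, in Kokkos's default memory space (typically device memory) */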
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
  } else {
    PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
    PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
  }
  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }

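  /* For each message size n (powers of 2 up to maxn), build a star forest whose graph connects rank 1's n leaves to rank 0's n roots */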
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

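  /* Use the user-provided warm-up/iteration counts if given; otherwise fall back to the OSU defaults for small messages */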
  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }

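  /* For each message size: refresh the buffers, warm up for nskip iterations, then time niter ping-pong round trips */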
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
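    /* Fill rootdata/leafdata from the host source buffers, copying to the device when the data lives there */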
    if (mtype == PETSC_MEMTYPE_HOST) {
      PetscCall(PetscArraycpy(rootdata, pbuf, n));
      PetscCall(PetscArraycpy(leafdata, ebuf, n));
    } else {
      Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n);
      Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
      PetscCallCXX(Kokkos::deep_copy(dst1, src1));
      PetscCallCXX(Kokkos::deep_copy(dst2, src2));
    }

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));

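    /* Each iteration is one round trip: a Bcast (rank 0 -> rank 1) followed by a Reduce with MPI_REPLACE (rank 1 -> rank 0).
       The first nskip iterations are warm-up; the timer starts once they have completed. */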
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        PetscCallCXX(Kokkos::fence());
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
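    /* Make sure all device work has completed and both ranks are done before reading the end time */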
    PetscCallCXX(Kokkos::fence());
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end = MPI_Wtime();
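    /* Report one-way latency in microseconds, i.e., half of the average round-trip time */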
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }
  PetscCall(PetscFree2(pbuf, ebuf));
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscFree2(rootdata, leafdata));
  } else {
    PetscCallCXX(Kokkos::kokkos_free(rootdata));
    PetscCallCXX(Kokkos::kokkos_free(leafdata));
  }
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  Kokkos::finalize();
  return 0;
}

/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: cuda mpi_gpu_aware mpix_stream
      suffix: mpix
      # MPICH doesn't reserve VCIs by default, and per MPICH developers only 1 VCI is needed for GPU
      env: MPIR_CVAR_CH4_RESERVE_VCIS=1
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/