xref: /petsc/src/vec/is/sf/tests/ex4k.kokkos.cxx (revision 6c5693054f5123506dab0f5da2d352ed973d0e50)
/* Help text passed to PetscInitialize() in main() */
static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
2a90d8e38SSatish Balay 
/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes.  The first process sends a message
  to the second process, and after having received the message, the second
  process sends a message back to the first process once.  This is repeated
  a number of times.  The latency is defined as half of the round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpiexec -n 2 ./ex4k -mtype <type>
  Other arguments have a default value that is also used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos
*/
23a90d8e38SSatish Balay #include <petscsf.h>
24a90d8e38SSatish Balay #include <Kokkos_Core.hpp>
25a90d8e38SSatish Balay 
/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000 /* default number of timed round trips for small messages */
#define LAT_SKIP_SMALL     100   /* default number of untimed warm-up round trips for small messages */
#define LAT_LOOP_LARGE     1000  /* number of timed round trips for large messages */
#define LAT_SKIP_LARGE     10    /* number of untimed warm-up round trips for large messages */
#define LARGE_MESSAGE_SIZE 8192  /* messages strictly larger than this many bytes count as large */
32a90d8e38SSatish Balay 
main(int argc,char ** argv)33a90d8e38SSatish Balay int main(int argc, char **argv)
34a90d8e38SSatish Balay {
35a90d8e38SSatish Balay   PetscSF        sf[64];
36a90d8e38SSatish Balay   PetscLogDouble t_start = 0, t_end = 0, time[64];
37a90d8e38SSatish Balay   PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
38a90d8e38SSatish Balay   PetscInt       maxn = 512 * 1024; /* max 4M bytes messages */
39a90d8e38SSatish Balay   PetscSFNode   *iremote;
40a90d8e38SSatish Balay   PetscMPIInt    rank, size;
41a90d8e38SSatish Balay   PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
42a90d8e38SSatish Balay   size_t         msgsize;
43a90d8e38SSatish Balay   PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
44a90d8e38SSatish Balay   char           mstring[16] = {0};
45a90d8e38SSatish Balay   PetscBool      set;
46a90d8e38SSatish Balay   PetscInt       skipSmall = -1, loopSmall = -1;
47a90d8e38SSatish Balay   MPI_Op         op = MPI_REPLACE;
48a90d8e38SSatish Balay 
49a90d8e38SSatish Balay   PetscFunctionBeginUser;
50a90d8e38SSatish Balay   Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
51a90d8e38SSatish Balay   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
52a90d8e38SSatish Balay   PetscCall(PetscKokkosInitializeCheck());
53a90d8e38SSatish Balay 
54a90d8e38SSatish Balay   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
55a90d8e38SSatish Balay   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
56a90d8e38SSatish Balay   PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");
57a90d8e38SSatish Balay 
58a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
59a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
60a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));
61a90d8e38SSatish Balay 
62a90d8e38SSatish Balay   PetscCall(PetscMalloc1(maxn, &iremote));
63a90d8e38SSatish Balay   PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
64a90d8e38SSatish Balay   if (set) {
65a90d8e38SSatish Balay     PetscBool isHost, isKokkos;
66a90d8e38SSatish Balay     PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
67a90d8e38SSatish Balay     PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
68a90d8e38SSatish Balay     if (isHost) mtype = PETSC_MEMTYPE_HOST;
69a90d8e38SSatish Balay     else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
70a90d8e38SSatish Balay     else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
71a90d8e38SSatish Balay   }
72a90d8e38SSatish Balay 
73a90d8e38SSatish Balay   if (mtype == PETSC_MEMTYPE_HOST) {
74a90d8e38SSatish Balay     PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
75a90d8e38SSatish Balay   } else {
76a90d8e38SSatish Balay     PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
77a90d8e38SSatish Balay     PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
78a90d8e38SSatish Balay   }
79a90d8e38SSatish Balay   PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
80a90d8e38SSatish Balay   for (i = 0; i < maxn; i++) {
81a90d8e38SSatish Balay     pbuf[i] = 123.0;
82a90d8e38SSatish Balay     ebuf[i] = 456.0;
83a90d8e38SSatish Balay   }
84a90d8e38SSatish Balay 
85a90d8e38SSatish Balay   for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
86a90d8e38SSatish Balay     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
87a90d8e38SSatish Balay     PetscCall(PetscSFSetFromOptions(sf[i]));
88a90d8e38SSatish Balay     if (rank == 0) {
89a90d8e38SSatish Balay       nroots  = n;
90a90d8e38SSatish Balay       nleaves = 0;
91a90d8e38SSatish Balay     } else {
92a90d8e38SSatish Balay       nroots  = 0;
93a90d8e38SSatish Balay       nleaves = n;
94a90d8e38SSatish Balay       for (j = 0; j < nleaves; j++) {
95a90d8e38SSatish Balay         iremote[j].rank  = 0;
96a90d8e38SSatish Balay         iremote[j].index = j;
97a90d8e38SSatish Balay       }
98a90d8e38SSatish Balay     }
99a90d8e38SSatish Balay     PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
100a90d8e38SSatish Balay     PetscCall(PetscSFSetUp(sf[i]));
101a90d8e38SSatish Balay   }
102a90d8e38SSatish Balay 
103a90d8e38SSatish Balay   if (loopSmall > 0) {
104a90d8e38SSatish Balay     nskip = skipSmall;
105a90d8e38SSatish Balay     niter = loopSmall;
106a90d8e38SSatish Balay   } else {
107a90d8e38SSatish Balay     nskip = LAT_SKIP_SMALL;
108a90d8e38SSatish Balay     niter = LAT_LOOP_SMALL;
109a90d8e38SSatish Balay   }
110a90d8e38SSatish Balay 
111a90d8e38SSatish Balay   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
112a90d8e38SSatish Balay     msgsize = sizeof(PetscScalar) * n;
113a90d8e38SSatish Balay     if (mtype == PETSC_MEMTYPE_HOST) {
114a90d8e38SSatish Balay       PetscCall(PetscArraycpy(rootdata, pbuf, n));
115a90d8e38SSatish Balay       PetscCall(PetscArraycpy(leafdata, ebuf, n));
116a90d8e38SSatish Balay     } else {
117a90d8e38SSatish Balay       Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
118a90d8e38SSatish Balay       Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
119a90d8e38SSatish Balay       Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
120a90d8e38SSatish Balay       Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
121a90d8e38SSatish Balay       PetscCallCXX(Kokkos::deep_copy(dst1, src1));
122a90d8e38SSatish Balay       PetscCallCXX(Kokkos::deep_copy(dst2, src2));
123a90d8e38SSatish Balay     }
124a90d8e38SSatish Balay 
125a90d8e38SSatish Balay     if (msgsize > LARGE_MESSAGE_SIZE) {
126a90d8e38SSatish Balay       nskip = LAT_SKIP_LARGE;
127a90d8e38SSatish Balay       niter = LAT_LOOP_LARGE;
128a90d8e38SSatish Balay     }
129a90d8e38SSatish Balay     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
130a90d8e38SSatish Balay 
131a90d8e38SSatish Balay     for (i = 0; i < niter + nskip; i++) {
132a90d8e38SSatish Balay       if (i == nskip) {
133a90d8e38SSatish Balay         PetscCallCXX(Kokkos::fence());
134a90d8e38SSatish Balay         PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
135a90d8e38SSatish Balay         t_start = MPI_Wtime();
136a90d8e38SSatish Balay       }
137a90d8e38SSatish Balay       PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
138a90d8e38SSatish Balay       PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
139a90d8e38SSatish Balay       PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
140a90d8e38SSatish Balay       PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
141a90d8e38SSatish Balay     }
142a90d8e38SSatish Balay     PetscCallCXX(Kokkos::fence());
143a90d8e38SSatish Balay     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
144a90d8e38SSatish Balay     t_end   = MPI_Wtime();
145a90d8e38SSatish Balay     time[j] = (t_end - t_start) * 1e6 / (niter * 2);
146a90d8e38SSatish Balay   }
147a90d8e38SSatish Balay 
148a90d8e38SSatish Balay   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
149a90d8e38SSatish Balay   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
150a90d8e38SSatish Balay     PetscCall(PetscSFDestroy(&sf[j]));
151a90d8e38SSatish Balay     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
152a90d8e38SSatish Balay   }
153a90d8e38SSatish Balay   PetscCall(PetscFree2(pbuf, ebuf));
154a90d8e38SSatish Balay   if (mtype == PETSC_MEMTYPE_HOST) {
155a90d8e38SSatish Balay     PetscCall(PetscFree2(rootdata, leafdata));
156a90d8e38SSatish Balay   } else {
157a90d8e38SSatish Balay     PetscCallCXX(Kokkos::kokkos_free(rootdata));
158a90d8e38SSatish Balay     PetscCallCXX(Kokkos::kokkos_free(leafdata));
159a90d8e38SSatish Balay   }
160a90d8e38SSatish Balay   PetscCall(PetscFree(iremote));
161a90d8e38SSatish Balay   PetscCall(PetscFinalize());
162a90d8e38SSatish Balay   Kokkos::finalize();
163a90d8e38SSatish Balay   return 0;
164a90d8e38SSatish Balay }
165a90d8e38SSatish Balay 
/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: cuda mpi_gpu_aware mpix_stream
      suffix: mpix
      # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU
      env: MPIR_CVAR_CH4_RESERVE_VCIS=1
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/
186