static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes.  The first process sends a message
  to the second process, and after receiving it, the second process sends
  the message back to the first process.  This exchange is repeated a
  number of times.  The latency is defined as half of the round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpiexec -n 2 ./ex4k -mtype <type>
  The other arguments have default values matching those used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k  -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos
*/
#include <petscsf.h>
#include <Kokkos_Core.hpp>

/* Same values as the OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max 4M bytes messages */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      set;
  PetscInt       skipSmall = -1, loopSmall = -1;
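  /* With op = MPI_REPLACE, the Bcast and Reduce below perform plain copies rather than arithmetic reductions, so each Bcast+Reduce pair is one ping-pong round trip */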
  MPI_Op         op = MPI_REPLACE;

  PetscFunctionBeginUser;
  Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCall(PetscKokkosInitializeCheck());

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscBool isHost, isKokkos;
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
  } else {
    PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
    PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
  }
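  /* pbuf/ebuf are host staging buffers with known values (123.0 and 456.0) that are copied into rootdata/leafdata before each message size is timed */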
  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }

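  /* Create one SF per power-of-two message size: rank 0 owns n roots, rank 1 owns n leaves that all point at rank 0's roots */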
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }

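  /* For each message size: refill the root/leaf buffers, then time niter round trips after nskip warm-up iterations */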
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    if (mtype == PETSC_MEMTYPE_HOST) {
      PetscCall(PetscArraycpy(rootdata, pbuf, n));
      PetscCall(PetscArraycpy(leafdata, ebuf, n));
    } else {
      Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
      Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
      PetscCallCXX(Kokkos::deep_copy(dst1, src1));
      PetscCallCXX(Kokkos::deep_copy(dst2, src2));
    }

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));

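    /* Each iteration is one round trip: Bcast pushes rootdata (rank 0) to leafdata (rank 1), Reduce sends it back; the timer starts only after the warm-up iterations */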
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        PetscCallCXX(Kokkos::fence());
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
    PetscCallCXX(Kokkos::fence());
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end   = MPI_Wtime();
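    /* elapsed / niter is the round-trip time; halve it and convert to microseconds to get the one-way latency */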
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }
  PetscCall(PetscFree2(pbuf, ebuf));
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscFree2(rootdata, leafdata));
  } else {
    PetscCallCXX(Kokkos::kokkos_free(rootdata));
    PetscCallCXX(Kokkos::kokkos_free(leafdata));
  }
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  Kokkos::finalize();
  return 0;
}

/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
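    # latencies are not reproducible, so discard all output and only check that the run completes cleanly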
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: cuda mpi_gpu_aware mpix_stream
      suffix: mpix
      # MPICH doesn't reserve VCIs by default, and per MPICH developers only 1 VCI is needed for GPU
      env: MPIR_CVAR_CH4_RESERVE_VCIS=1
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/