// Help text passed to PetscInitialize() in main(). It is never modified, hence const.
static const char help[] = "Benchmarking device kernel launch time\n";
2 /*
3 Running example on Summit at OLCF:
4 # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
5 $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1k
6 Average asynchronous device kernel launch time = 4.86 microseconds
7 Average synchronous device kernel launch time = 12.83 microseconds
8
9 Frontier@OLCF
10 $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k
11 Average asynchronous device kernel launch time = 1.88 microseconds
12 Average synchronous device kernel launch time = 7.78 microseconds
13
14 Aurora@ALCF
15 $ mpiexec -n 1 ./ex1k
16 Average asynchronous device kernel launch time = 3.34 microseconds
17 Average synchronous device kernel launch time = 6.24 microseconds
18
19 Perlmutter@NERSC
20 $ srun -n 1 --gpus-per-task=1 ./ex1k
21 Average asynchronous device kernel launch time = 2.31 microseconds
22 Average synchronous device kernel launch time = 7.13 microseconds
23 */
24
25 #include <petscsys.h>
26 #include <petsc_kokkos.hpp>
27
main(int argc,char ** argv)28 int main(int argc, char **argv)
29 {
30 PetscInt i, n = 100000, N = 256;
31 PetscLogDouble tstart, tend, time;
32
33 PetscFunctionBeginUser;
34 PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
35 PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
36 PetscCall(PetscKokkosInitializeCheck());
37 {
38 Kokkos::DefaultExecutionSpace exec = PetscGetKokkosExecutionSpace();
39 Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
40
41 PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
42 // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
43 PetscCall(PetscTime(&tstart));
44 for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
45 PetscCall(PetscTime(&tend));
46 PetscCallCXX(exec.fence());
47 time = (tend - tstart) * 1e6 / n;
48 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
49
50 // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
51 PetscCall(PetscTime(&tstart));
52 for (i = 0; i < n; i++) {
53 PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
54 PetscCallCXX(exec.fence());
55 }
56 PetscCall(PetscTime(&tend));
57 time = (tend - tstart) * 1e6 / n;
58 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time = %.2f microseconds\n", time));
59 }
60
61 PetscCall(PetscFinalize());
62 return 0;
63 }
64
65 /*TEST
66 test:
67 requires: kokkos
68 args: -n 2
69 output_file: output/empty.out
70 filter: grep "DOES_NOT_EXIST"
71
72 TEST*/
73