xref: /petsc/src/sys/objects/device/tests/ex1k.kokkos.cxx (revision d52a580b706c59ca78066c1e38754e45b6b56e2b)
// Help message for this benchmark; main() passes it to PetscInitialize().
1 static char help[] = "Benchmarking device kernel launch time\n";
2 /*
3   Running example on Summit at OLCF:
4   # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
5   $ jsrun -n1 -a1 -c7 -g1 -r1  ./ex1k
6   Average asynchronous device kernel launch time = 4.86 microseconds
7   Average synchronous device kernel launch time  = 12.83 microseconds
8 
9   Frontier@OLCF
10   $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k
11   Average asynchronous device kernel launch time = 1.88 microseconds
12   Average synchronous device kernel launch time  = 7.78 microseconds
13 
14   Aurora@ALCF
15   $ mpiexec -n 1 ./ex1k
16   Average asynchronous device kernel launch time = 3.34 microseconds
17   Average synchronous device kernel launch time  = 6.24 microseconds
18 
19   Perlmutter@NERSC
20   $ srun -n 1 --gpus-per-task=1 ./ex1k
21   Average asynchronous device kernel launch time = 2.31 microseconds
22   Average synchronous device kernel launch time  = 7.13 microseconds
23 */
24 
25 #include <petscsys.h>
26 #include <petsc_kokkos.hpp>
27 
28 int main(int argc, char **argv)
29 {
30   PetscInt       i, n = 100000, N = 256;
31   PetscLogDouble tstart, tend, time;
32 
33   PetscFunctionBeginUser;
34   PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
35   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
36   PetscCall(PetscKokkosInitializeCheck());
37   {
38     Kokkos::DefaultExecutionSpace                      exec = PetscGetKokkosExecutionSpace();
39     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
40 
41     PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
42     // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
43     PetscCall(PetscTime(&tstart));
44     for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
45     PetscCall(PetscTime(&tend));
46     PetscCallCXX(exec.fence());
47     time = (tend - tstart) * 1e6 / n;
48     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
49 
50     // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
51     PetscCall(PetscTime(&tstart));
52     for (i = 0; i < n; i++) {
53       PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
54       PetscCallCXX(exec.fence());
55     }
56     PetscCall(PetscTime(&tend));
57     time = (tend - tstart) * 1e6 / n;
58     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time  = %.2f microseconds\n", time));
59   }
60 
61   PetscCall(PetscFinalize());
62   return 0;
63 }
64 
65 /*TEST
66   test:
67     requires: kokkos
68     args: -n 2
69     output_file: output/empty.out
70     filter: grep "DOES_NOT_EXIST"
71 
72 TEST*/
73