// Help text handed to PetscInitialize(); it is never modified, hence const.
static const char help[] = "Benchmarking device kernel launch time\n";
/*
  Running example on Summit at OLCF:
  # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
  $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1k
  Average asynchronous device kernel launch time = 4.86 microseconds
  Average synchronous device kernel launch time = 12.83 microseconds

  Frontier@OLCF
  $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k
  Average asynchronous device kernel launch time = 1.88 microseconds
  Average synchronous device kernel launch time = 7.78 microseconds

  Aurora@ALCF
  $ mpiexec -n 1 ./ex1k
  Average asynchronous device kernel launch time = 3.34 microseconds
  Average synchronous device kernel launch time = 6.24 microseconds

  Perlmutter@NERSC
  $ srun -n 1 --gpus-per-task=1 ./ex1k
  Average asynchronous device kernel launch time = 2.31 microseconds
  Average synchronous device kernel launch time = 7.13 microseconds
*/
24896e5da2SSatish Balay
25896e5da2SSatish Balay #include <petscsys.h>
26896e5da2SSatish Balay #include <petsc_kokkos.hpp>
27896e5da2SSatish Balay
main(int argc,char ** argv)28896e5da2SSatish Balay int main(int argc, char **argv)
29896e5da2SSatish Balay {
30896e5da2SSatish Balay PetscInt i, n = 100000, N = 256;
31896e5da2SSatish Balay PetscLogDouble tstart, tend, time;
32896e5da2SSatish Balay
33896e5da2SSatish Balay PetscFunctionBeginUser;
34896e5da2SSatish Balay PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
35896e5da2SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
36896e5da2SSatish Balay PetscCall(PetscKokkosInitializeCheck());
37896e5da2SSatish Balay {
38896e5da2SSatish Balay Kokkos::DefaultExecutionSpace exec = PetscGetKokkosExecutionSpace();
39896e5da2SSatish Balay Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
40896e5da2SSatish Balay
41896e5da2SSatish Balay PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
42896e5da2SSatish Balay // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
43896e5da2SSatish Balay PetscCall(PetscTime(&tstart));
44896e5da2SSatish Balay for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
45896e5da2SSatish Balay PetscCall(PetscTime(&tend));
46896e5da2SSatish Balay PetscCallCXX(exec.fence());
47896e5da2SSatish Balay time = (tend - tstart) * 1e6 / n;
48896e5da2SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
49896e5da2SSatish Balay
50896e5da2SSatish Balay // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
51896e5da2SSatish Balay PetscCall(PetscTime(&tstart));
52896e5da2SSatish Balay for (i = 0; i < n; i++) {
53896e5da2SSatish Balay PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
54896e5da2SSatish Balay PetscCallCXX(exec.fence());
55896e5da2SSatish Balay }
56896e5da2SSatish Balay PetscCall(PetscTime(&tend));
57896e5da2SSatish Balay time = (tend - tstart) * 1e6 / n;
58896e5da2SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time = %.2f microseconds\n", time));
59896e5da2SSatish Balay }
60896e5da2SSatish Balay
61896e5da2SSatish Balay PetscCall(PetscFinalize());
62896e5da2SSatish Balay return 0;
63896e5da2SSatish Balay }
64896e5da2SSatish Balay
/*TEST
  test:
    requires: kokkos
    args: -n 2
    output_file: output/empty.out
    filter: grep "DOES_NOT_EXIST"

TEST*/
73