1 static char help[] = "Benchmarking device kernel launch time\n"; 2 /* 3 Running example on Summit at OLCF: 4 # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS 5 $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1k 6 Average asynchronous device kernel launch time = 4.86 microseconds 7 Average synchronous device kernel launch time = 12.83 microseconds 8 9 Frontier@OLCF 10 $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k 11 Average asynchronous device kernel launch time = 1.88 microseconds 12 Average synchronous device kernel launch time = 7.78 microseconds 13 14 Aurora@ALCF 15 $ mpirun -n 1 ./ex1k 16 Average asynchronous device kernel launch time = 3.34 microseconds 17 Average synchronous device kernel launch time = 6.24 microseconds 18 19 Perlmutter@NERSC 20 $ srun -n 1 --gpus-per-task=1 ./ex1k 21 Average asynchronous device kernel launch time = 2.31 microseconds 22 Average synchronous device kernel launch time = 7.13 microseconds 23 */ 24 25 #include <petscsys.h> 26 #include <petsc_kokkos.hpp> 27 28 int main(int argc, char **argv) 29 { 30 PetscInt i, n = 100000, N = 256; 31 PetscLogDouble tstart, tend, time; 32 33 PetscFunctionBeginUser; 34 PetscCall(PetscInitialize(&argc, &argv, nullptr, help)); 35 PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); 36 PetscCall(PetscKokkosInitializeCheck()); 37 { 38 Kokkos::DefaultExecutionSpace exec = PetscGetKokkosExecutionSpace(); 39 Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N); 40 41 PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below 42 // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one 43 PetscCall(PetscTime(&tstart)); 44 for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); 45 PetscCall(PetscTime(&tend)); 46 PetscCallCXX(exec.fence()); 47 time = (tend - tstart) * 1e6 / n; 48 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time)); 49 50 // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed 51 PetscCall(PetscTime(&tstart)); 52 for (i = 0; i < n; i++) { 53 PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); 54 PetscCallCXX(exec.fence()); 55 } 56 PetscCall(PetscTime(&tend)); 57 time = (tend - tstart) * 1e6 / n; 58 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time = %.2f microseconds\n", time)); 59 } 60 61 PetscCall(PetscFinalize()); 62 return 0; 63 } 64 65 /*TEST 66 test: 67 requires: kokkos 68 args: -n 2 69 output_file: output/empty.out 70 filter: grep "DOES_NOT_EXIST" 71 72 TEST*/ 73