1 static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n"; 2 3 /* 4 This is a simple test to measure the latency of MPI communication. 5 The test is run with two processes. The first process sends a message 6 to the second process, and after having received the message, the second 7 process sends a message back to the first process once. The is repeated 8 a number of times. The latency is defined as half time of the round-trip. 9 10 It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/). 11 12 Usage: mpirun -n 2 ./ex4k -mtype <type> 13 Other arguments have a default value that is also used in osu_latency. 14 15 Examples: 16 17 On Summit at OLCF: 18 jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos 19 20 On Crusher at OLCF: 21 srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos 22 */ 23 #include <petscsf.h> 24 #include <Kokkos_Core.hpp> 25 26 /* Same values as OSU microbenchmarks */ 27 #define LAT_LOOP_SMALL 10000 28 #define LAT_SKIP_SMALL 100 29 #define LAT_LOOP_LARGE 1000 30 #define LAT_SKIP_LARGE 10 31 #define LARGE_MESSAGE_SIZE 8192 32 33 int main(int argc, char **argv) 34 { 35 PetscSF sf[64]; 36 PetscLogDouble t_start = 0, t_end = 0, time[64]; 37 PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10; 38 PetscInt maxn = 512 * 1024; /* max 4M bytes messages */ 39 PetscSFNode *iremote; 40 PetscMPIInt rank, size; 41 PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf; 42 size_t msgsize; 43 PetscMemType mtype = PETSC_MEMTYPE_HOST; 44 char mstring[16] = {0}; 45 PetscBool set; 46 PetscInt skipSmall = -1, loopSmall = -1; 47 MPI_Op op = MPI_REPLACE; 48 49 PetscFunctionBeginUser; 50 Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc 51 PetscCall(PetscInitialize(&argc, &argv, NULL, help)); 52 PetscCall(PetscKokkosInitializeCheck()); 53 54 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 55 PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank)); 56 PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes"); 57 58 PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */ 59 PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL)); 60 PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL)); 61 62 PetscCall(PetscMalloc1(maxn, &iremote)); 63 PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set)); 64 if (set) { 65 PetscBool isHost, isKokkos; 66 PetscCall(PetscStrcasecmp(mstring, "host", &isHost)); 67 PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos)); 68 if (isHost) mtype = PETSC_MEMTYPE_HOST; 69 else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS; 70 else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring); 71 } 72 73 if (mtype == PETSC_MEMTYPE_HOST) { 74 PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata)); 75 } else { 76 PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 77 PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 78 } 79 PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf)); 80 for (i = 0; i < maxn; i++) { 81 pbuf[i] = 123.0; 82 ebuf[i] = 456.0; 83 } 84 85 for (n = 1, i = 0; n <= maxn; n *= 2, i++) { 86 PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i])); 87 PetscCall(PetscSFSetFromOptions(sf[i])); 88 if (rank == 0) { 89 nroots = n; 90 nleaves = 0; 91 } else { 92 nroots = 0; 93 nleaves = n; 94 for (j = 0; j < nleaves; j++) { 95 iremote[j].rank = 0; 96 iremote[j].index = j; 97 } 98 } 99 PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES)); 100 PetscCall(PetscSFSetUp(sf[i])); 101 } 102 103 if (loopSmall > 0) { 104 nskip = skipSmall; 105 niter = loopSmall; 106 } else { 107 nskip = LAT_SKIP_SMALL; 108 niter = LAT_LOOP_SMALL; 109 } 110 111 for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 112 msgsize = sizeof(PetscScalar) * n; 113 if (mtype == PETSC_MEMTYPE_HOST) { 114 PetscCall(PetscArraycpy(rootdata, pbuf, n)); 115 PetscCall(PetscArraycpy(leafdata, ebuf, n)); 116 } else { 117 Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n); 118 Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n); 119 Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n); 120 Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n); 121 PetscCallCXX(Kokkos::deep_copy(dst1, src1)); 122 PetscCallCXX(Kokkos::deep_copy(dst2, src2)); 123 } 124 125 if (msgsize > LARGE_MESSAGE_SIZE) { 126 nskip = LAT_SKIP_LARGE; 127 niter = LAT_LOOP_LARGE; 128 } 129 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 130 131 for (i = 0; i < niter + nskip; i++) { 132 if (i == nskip) { 133 PetscCallCXX(Kokkos::fence()); 134 PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 135 t_start = MPI_Wtime(); 136 } 137 PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op)); 138 PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op)); 139 PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op)); 140 PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op)); 141 } 142 PetscCallCXX(Kokkos::fence()); 143 PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 144 t_end = MPI_Wtime(); 145 time[j] = (t_end - t_start) * 1e6 / (niter * 2); 146 } 147 148 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device")); 149 for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 150 PetscCall(PetscSFDestroy(&sf[j])); 151 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j])); 152 } 153 PetscCall(PetscFree2(pbuf, ebuf)); 154 if (mtype == PETSC_MEMTYPE_HOST) { 155 PetscCall(PetscFree2(rootdata, leafdata)); 156 } else { 157 PetscCallCXX(Kokkos::kokkos_free(rootdata)); 158 PetscCallCXX(Kokkos::kokkos_free(leafdata)); 159 } 160 PetscCall(PetscFree(iremote)); 161 PetscCall(PetscFinalize()); 162 Kokkos::finalize(); 163 return 0; 164 } 165 166 /*TEST 167 testset: 168 requires: kokkos 169 # use small numbers to make the test cheap 170 args: -maxn 4 -skipSmall 1 -loopSmall 1 171 filter: grep "DOES_NOT_EXIST" 172 output_file: output/empty.out 173 nsize: 2 174 175 test: 176 args: -mtype {{host kokkos}} 177 178 test: 179 requires: cuda mpi_gpu_aware mpix_stream 180 suffix: mpix 181 # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU 182 env: MPIR_CVAR_CH4_RESERVE_VCIS=1 183 args: -mtype kokkos -sf_use_stream_aware_mpi 1 184 185 TEST*/ 186