static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";

/*
  Usage:
    mpirun -n <np> ./ex5k
      -f <file>        : input PETSc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
      -mat_type <type> : aij or one of its subclasses. Default is aij.
      -n <num>         : run MatMult() this many times and report the average time. Default is 500.

  Notes:
    It uses a CPU timer to measure the MatMult() time.

  Examples:
    On OLCF Summit (with GPU-aware MPI):
      # 6 MPI ranks:
      # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), 1 GPU per RS (-g 1), and 6 RSs per node (-r 6)
      jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex5k -f 1138_bus.aij -mat_type aijcusparse

      # 1 MPI rank
      jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex5k -f 1138_bus.aij -mat_type aijcusparse

    On OLCF Crusher:
      # 1 MPI rank
      # run with 1 node (-N1), 1 MPI rank (-n1), 2 hardware threads per rank (-c2)
      srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos

      # 8 MPI ranks
      srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos
*/
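/*
  A sketch of preparing an input file: the -f option expects a matrix in PETSc binary format.
  Assuming the -fin/-fout options of mat/tests/ex72.c (verify with ./ex72 -help for your PETSc
  version), a MatrixMarket file such as HV15R.mtx could be converted and then benchmarked with:

    ./ex72 -fin HV15R.mtx -fout HV15R.aij    # MatrixMarket -> PETSc binary
    mpirun -n 2 ./ex5k -f HV15R.aij -mat_type aijkokkos -n 1000
*/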
#include <petscmat.h>
#include <petscdevice.h>

#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice()
#endif

int main(int argc, char **args)
{
  Mat            A, A2;
  Vec            x, y, x2, y2;
  PetscViewer    fd;
  char           matfile[PETSC_MAX_PATH_LEN];
  char           mattype[64];
  PetscBool      flg;
  PetscLogStage  stage;
  PetscInt       i, n = 500, nskip = 5, M, N;
  MatInfo        info;
  PetscLogDouble tstart = 0, tend = 0, avgTime;
  PetscRandom    rctx;
  PetscReal      norm;
  PetscMPIInt    size;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, nullptr, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

  /* Read the option -n */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

  /* Read the matrix file name and the matrix type */
  PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
  PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a PETSc matrix binary file with the -f option");
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
  if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));

  /* Load the matrix from the binary file into A2 */
  PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
  PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
  PetscCall(MatSetType(A2, MATAIJ));
  PetscCall(MatLoad(A2, fd));
  PetscCall(MatCreateVecs(A2, &x2, &y2));
  PetscCall(PetscViewerDestroy(&fd));

  PetscCall(MatGetSize(A2, &M, &N));
  PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

  /* Copy A2 to A and convert A to the specified type */
  PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
  PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
  PetscCall(MatCreateVecs(A, &x, &y));

  /* Initialize x and x2 with the same random values */
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
  PetscCall(VecSetRandom(x2, rctx));
  PetscCall(PetscRandomDestroy(&rctx));
  PetscCall(VecCopy(x2, x));

  /* Compute the reference y2 = A2 x2 */
  PetscCall(MatMult(A2, x2, y2));

  /* Measure y = A x, skipping the first nskip iterations as warm-up */
  PetscCall(PetscLogStageRegister("MatMult", &stage));
  for (i = 0; i < n + nskip; i++) {
    if (i == nskip) { /* warm-up done; sync the device, line up the ranks and start the timer */
      SyncDevice();
      PetscCall(PetscLogStagePush(stage));
      PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
      PetscCall(PetscTime(&tstart));
    }
    PetscCall(MatMult(A, x, y));
  }
  SyncDevice();
  PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
  PetscCall(PetscTime(&tend));
  avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
  PetscCall(PetscLogStagePop());

  /* Validate y against the reference y2 */
  PetscCall(VecAYPX(y2, -1, y)); /* y2 = y - y2 */
  PetscCall(VecNorm(y2, NORM_2, &norm));
  PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm);
  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&y));
  PetscCall(MatDestroy(&A2));
  PetscCall(VecDestroy(&x2));
  PetscCall(VecDestroy(&y2));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST

  testset:
    args: -n 2 -f ${DATAFILESPATH}/matrices/small
    nsize: 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    requires: datafilespath !complex double !defined(PETSC_USE_64BIT_INDICES) kokkos_kernels

    test:
      suffix: 1
      requires: cuda
      args: -mat_type aijcusparse

    test:
      suffix: 2
      args: -mat_type aijkokkos

    test:
      suffix: 3
      requires: hip
      args: -mat_type aijhipsparse

TEST*/