1a90d8e38SSatish Balay static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";
2a90d8e38SSatish Balay
3a90d8e38SSatish Balay /*
4a90d8e38SSatish Balay Usage:
5*a8cf87e0SJunchao Zhang mpiexec -n <np> ./ex5k
6a90d8e38SSatish Balay -f <file> : input PETSc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
7a90d8e38SSatish Balay -mat_type <type> : aij or its subclass. Default is aij.
8a90d8e38SSatish Balay -n <num> : run MatMult() this many times and report average time. Default is 500.
9a90d8e38SSatish Balay
10a90d8e38SSatish Balay Notes:
11a90d8e38SSatish Balay It uses CPU-timer to measure the time.
12a90d8e38SSatish Balay
13a90d8e38SSatish Balay Examples:
14a90d8e38SSatish Balay On OLCF Summit (with GPU-aware MPI)
15a90d8e38SSatish Balay # 6 MPI ranks:
16a90d8e38SSatish Balay # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6)
17a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex5k -f 1138_bus.aij -mat_type aijcusparse
18a90d8e38SSatish Balay
19a90d8e38SSatish Balay # 1 MPI rank
20a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex5k -f 1138_bus.aij -mat_type aijcusparse
21a90d8e38SSatish Balay
22a90d8e38SSatish Balay On OLCF Crusher:
23a90d8e38SSatish Balay # 1 MPI rank
24a90d8e38SSatish Balay # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2)
25a90d8e38SSatish Balay srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos
26a90d8e38SSatish Balay
27a90d8e38SSatish Balay # 8 MPI ranks
28a90d8e38SSatish Balay srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos
29a90d8e38SSatish Balay */
30a90d8e38SSatish Balay #include <petscmat.h>
31a90d8e38SSatish Balay #include <petscdevice.h>
32a90d8e38SSatish Balay
33a90d8e38SSatish Balay #if defined(PETSC_HAVE_CUDA)
34a90d8e38SSatish Balay #include <petscdevice_cuda.h>
35a90d8e38SSatish Balay #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
36a90d8e38SSatish Balay #elif defined(PETSC_HAVE_HIP)
37a90d8e38SSatish Balay #include <petscdevice_hip.h>
38a90d8e38SSatish Balay #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
39a90d8e38SSatish Balay #elif defined(PETSC_HAVE_KOKKOS)
40a90d8e38SSatish Balay #include <Kokkos_Core.hpp>
41a90d8e38SSatish Balay #define SyncDevice() Kokkos::fence()
42a90d8e38SSatish Balay #else
43a90d8e38SSatish Balay #define SyncDevice()
44a90d8e38SSatish Balay #endif
45a90d8e38SSatish Balay
main(int argc,char ** args)46a90d8e38SSatish Balay int main(int argc, char **args)
47a90d8e38SSatish Balay {
48a90d8e38SSatish Balay Mat A, A2;
49a90d8e38SSatish Balay Vec x, y, x2, y2;
50a90d8e38SSatish Balay PetscViewer fd;
51a90d8e38SSatish Balay char matfile[PETSC_MAX_PATH_LEN];
52a90d8e38SSatish Balay char mattype[64];
53a90d8e38SSatish Balay PetscBool flg;
54a90d8e38SSatish Balay PetscLogStage stage;
55a90d8e38SSatish Balay PetscInt i, n = 500, nskip = 5, M, N;
56a90d8e38SSatish Balay MatInfo info;
57a90d8e38SSatish Balay PetscLogDouble tstart = 0, tend = 0, avgTime;
58a90d8e38SSatish Balay PetscRandom rctx;
59a90d8e38SSatish Balay PetscReal norm;
60a90d8e38SSatish Balay PetscMPIInt size;
61a90d8e38SSatish Balay
62a90d8e38SSatish Balay PetscFunctionBeginUser;
63a90d8e38SSatish Balay PetscCall(PetscInitialize(&argc, &args, nullptr, help));
64a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
65a90d8e38SSatish Balay
66a90d8e38SSatish Balay /* Read options -n */
67a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
68a90d8e38SSatish Balay
69a90d8e38SSatish Balay /* Load the matrix from a binary file */
70a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
71a90d8e38SSatish Balay PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a PETSc matrix binary file with the -f option");
72a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
73a90d8e38SSatish Balay if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));
74a90d8e38SSatish Balay
75a90d8e38SSatish Balay /* Read the matrix file to A2 */
76a90d8e38SSatish Balay PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
77a90d8e38SSatish Balay PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
78a90d8e38SSatish Balay PetscCall(MatSetType(A2, MATAIJ));
79a90d8e38SSatish Balay PetscCall(MatLoad(A2, fd));
80a90d8e38SSatish Balay PetscCall(MatCreateVecs(A2, &x2, &y2));
81a90d8e38SSatish Balay PetscCall(PetscViewerDestroy(&fd));
82a90d8e38SSatish Balay
83a90d8e38SSatish Balay PetscCall(MatGetSize(A2, &M, &N));
84a90d8e38SSatish Balay PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
85a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));
86a90d8e38SSatish Balay
87a90d8e38SSatish Balay /* Copy A2 to A and convert A to the specified type */
88a90d8e38SSatish Balay PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
89a90d8e38SSatish Balay PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
90a90d8e38SSatish Balay PetscCall(MatCreateVecs(A, &x, &y));
91a90d8e38SSatish Balay
92a90d8e38SSatish Balay /* Init x, x2 with the same value */
93a90d8e38SSatish Balay PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
94a90d8e38SSatish Balay PetscCall(VecSetRandom(x2, rctx));
95a90d8e38SSatish Balay PetscCall(PetscRandomDestroy(&rctx));
96a90d8e38SSatish Balay PetscCall(VecCopy(x2, x));
97a90d8e38SSatish Balay
98a90d8e38SSatish Balay /* Compute the reference y2 = A2 x2 */
99a90d8e38SSatish Balay PetscCall(MatMult(A2, x2, y2));
100a90d8e38SSatish Balay
101a90d8e38SSatish Balay /* Measure y = Ax */
102a90d8e38SSatish Balay PetscCall(PetscLogStageRegister("MatMult", &stage));
103a90d8e38SSatish Balay for (i = 0; i < n + nskip; i++) {
104a90d8e38SSatish Balay if (i == nskip) {
105a90d8e38SSatish Balay SyncDevice();
106a90d8e38SSatish Balay PetscCall(PetscLogStagePush(stage));
107a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
108a90d8e38SSatish Balay PetscCall(PetscTime(&tstart));
109a90d8e38SSatish Balay }
110a90d8e38SSatish Balay PetscCall(MatMult(A, x, y));
111a90d8e38SSatish Balay }
112a90d8e38SSatish Balay SyncDevice();
113a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
114a90d8e38SSatish Balay PetscCall(PetscTime(&tend));
115a90d8e38SSatish Balay avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
116a90d8e38SSatish Balay PetscCall(PetscLogStagePop());
117a90d8e38SSatish Balay
118a90d8e38SSatish Balay /* Validate y against y2 */
119a90d8e38SSatish Balay PetscCall(VecAYPX(y2, -1, y));
120a90d8e38SSatish Balay PetscCall(VecNorm(y2, NORM_2, &norm));
121a90d8e38SSatish Balay PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm);
122a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));
123a90d8e38SSatish Balay
124a90d8e38SSatish Balay PetscCall(MatDestroy(&A));
125a90d8e38SSatish Balay PetscCall(VecDestroy(&x));
126a90d8e38SSatish Balay PetscCall(VecDestroy(&y));
127a90d8e38SSatish Balay PetscCall(MatDestroy(&A2));
128a90d8e38SSatish Balay PetscCall(VecDestroy(&x2));
129a90d8e38SSatish Balay PetscCall(VecDestroy(&y2));
130a90d8e38SSatish Balay PetscCall(PetscFinalize());
131a90d8e38SSatish Balay return 0;
132a90d8e38SSatish Balay }
133a90d8e38SSatish Balay
134a90d8e38SSatish Balay /*TEST
135a90d8e38SSatish Balay
136a90d8e38SSatish Balay testset:
137a90d8e38SSatish Balay args: -n 2 -f ${DATAFILESPATH}/matrices/small
138a90d8e38SSatish Balay nsize: 1
139a90d8e38SSatish Balay filter: grep "DOES_NOT_EXIST"
140a90d8e38SSatish Balay output_file: output/empty.out
141a90d8e38SSatish Balay requires: datafilespath !complex double !defined(PETSC_USE_64BIT_INDICES) kokkos_kernels
142a90d8e38SSatish Balay
143a90d8e38SSatish Balay test:
144a90d8e38SSatish Balay suffix: 1
145a90d8e38SSatish Balay requires: cuda
146a90d8e38SSatish Balay args: -mat_type aijcusparse
147a90d8e38SSatish Balay
148a90d8e38SSatish Balay test:
149a90d8e38SSatish Balay suffix: 2
150a90d8e38SSatish Balay args: -mat_type aijkokkos
151a90d8e38SSatish Balay
152a90d8e38SSatish Balay test:
153a90d8e38SSatish Balay suffix: 3
154a90d8e38SSatish Balay requires: hip
155a90d8e38SSatish Balay args: -mat_type aijhipsparse
156a90d8e38SSatish Balay
157a90d8e38SSatish Balay TEST*/
158