xref: /petsc/src/mat/tests/ex237.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
static char help[] = "Mini-app to benchmark matrix--matrix multiplication\n\n"; /* printed by -help and at startup */
2 
3 /*
4   See the paper below for more information
5 
6    "KSPHPDDM and PCHPDDM: Extending PETSc with Robust Overlapping Schwarz Preconditioners and Advanced Krylov Methods",
7    P. Jolivet, J. E. Roman, and S. Zampini (2020).
8 */
9 
10 #include <petsc.h>
11 
#if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
#include <mkl.h>
/* Invoke an MKL inspector-executor sparse BLAS routine, with PETSc stack
   tracing, and translate a non-SPARSE_STATUS_SUCCESS result into a PETSc
   PETSC_ERR_LIB error carrying the routine name and the MKL status code */
#define PetscCallMKLSparse(func, args) \
  do { \
    sparse_status_t __ierr; \
    PetscStackPushExternal(#func); \
    __ierr = func args; \
    PetscStackPop; \
    PetscCheck(__ierr == SPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_LIB, "Error in %s(): error code %d", #func, (int)__ierr); \
  } while (0)
#else
/* without MKL support, any attempted MKL call is an immediate error */
#define PetscCallMKLSparse(func, args) \
  do { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No MKL support"); } while (0)
#endif
26 
/*
  Benchmarks sparse-matrix times dense-matrix products (and plain MatMult when
  the dense matrix has a single column) for a matrix loaded from a binary file
  (-f).  Loops over block sizes (-bs), numbers of dense columns (-N), and
  matrix types (-type); a type whose name contains "mkl" is benchmarked by
  calling the MKL inspector-executor sparse BLAS directly on the base type,
  and a type containing "maij" is benchmarked through MatCreateMAIJ()/MatMult().
  Timings are collected in per-configuration PetscLogStages.
*/
int main(int argc, char **argv) {
  Mat         A, C, D, E;
  PetscInt    nbs = 10, ntype = 10, nN = 8, m, M, trial = 5;
  PetscViewer viewer;
  PetscInt    bs[10], N[8];
  char       *type[10];
  PetscMPIInt size;
  PetscBool   flg, cuda, maij = PETSC_FALSE, check = PETSC_FALSE, trans = PETSC_FALSE, convert = PETSC_FALSE, mkl;
  char        file[PETSC_MAX_PATH_LEN];

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uniprocessor example only");
  PetscCall(PetscOptionsGetString(NULL, NULL, "-f", file, PETSC_MAX_PATH_LEN, &flg));
  PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate binary file with the -f option");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-trial", &trial, NULL));
  PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-bs", bs, &nbs, &flg));
  if (!flg) { /* default: a single run with scalar (block size 1) matrices */
    nbs   = 1;
    bs[0] = 1;
  }
  PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-N", N, &nN, &flg));
  if (!flg) { /* default: powers of two from 1 to 128 right-hand sides */
    nN   = 8;
    N[0] = 1;
    N[1] = 2;
    N[2] = 4;
    N[3] = 8;
    N[4] = 16;
    N[5] = 32;
    N[6] = 64;
    N[7] = 128;
  }
  PetscCall(PetscOptionsGetStringArray(NULL, NULL, "-type", type, &ntype, &flg));
  if (!flg) { /* default: benchmark plain SeqAIJ only */
    ntype = 1;
    PetscCall(PetscStrallocpy(MATSEQAIJ, &type[0]));
  }
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-check", &check, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-trans", &trans, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-convert_aij", &convert, NULL));
  /* one full pass (reload, inflate, benchmark all types and N values) per block size */
  for (PetscInt j = 0; j < nbs; ++j) {
    PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, file, FILE_MODE_READ, &viewer));
    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetFromOptions(A));
    PetscCall(MatLoad(A, viewer));
    PetscCall(PetscViewerDestroy(&viewer));
    PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJ, MATMPIAIJ, ""));
    PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a MatAIJ input matrix");
    PetscCall(MatGetSize(A, &m, &M));
    if (m == M) { /* square matrix: symmetrize it (A += A^T) so the SBAIJ variants below are usable */
      Mat oA;
      PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &oA));
      PetscCall(MatAXPY(A, 1.0, oA, DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatDestroy(&oA));
    }
    PetscCall(MatGetLocalSize(A, &m, NULL));
    PetscCall(MatGetSize(A, &M, NULL));
    if (bs[j] > 1) {
      /* inflate the scalar AIJ matrix into a SeqBAIJ matrix: each nonzero a_ij
         becomes the bs[j] x bs[j] block a_ij * T, where T is a random dense
         block symmetrized as T += T^T so the overall symmetry is preserved */
      Mat                T, Tt, B;
      const PetscScalar *ptr;
      PetscScalar       *val, *Aa;
      const PetscInt    *Ai, *Aj;
      PetscInt           An, i, k;
      PetscBool          done;

      PetscCall(MatCreateDense(PETSC_COMM_SELF, bs[j], bs[j], bs[j], bs[j], NULL, &T));
      PetscCall(MatSetRandom(T, NULL));
      PetscCall(MatTranspose(T, MAT_INITIAL_MATRIX, &Tt));
      PetscCall(MatAXPY(T, 1.0, Tt, SAME_NONZERO_PATTERN));
      PetscCall(MatDestroy(&Tt));
      PetscCall(MatDenseGetArrayRead(T, &ptr));
      PetscCall(MatGetRowIJ(A, 0, PETSC_FALSE, PETSC_FALSE, &An, &Ai, &Aj, &done));
      PetscCheck(done && An == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Inconsistent sizes");
      PetscCall(MatSeqAIJGetArray(A, &Aa));
      PetscCall(MatCreate(PETSC_COMM_WORLD, &B));
      PetscCall(MatSetType(B, MATSEQBAIJ));
      PetscCall(MatSetSizes(B, bs[j] * An, bs[j] * An, PETSC_DECIDE, PETSC_DECIDE));
      PetscCall(PetscMalloc1(Ai[An] * bs[j] * bs[j], &val)); /* Ai[An] = total number of nonzeros of A */
      for (i = 0; i < Ai[An]; ++i)
        for (k = 0; k < bs[j] * bs[j]; ++k) val[i * bs[j] * bs[j] + k] = Aa[i] * ptr[k];
      /* blocks in val follow the MatDense (column-major) layout of T, so feed them column-oriented */
      PetscCall(MatSetOption(B, MAT_ROW_ORIENTED, PETSC_FALSE));
      PetscCall(MatSeqBAIJSetPreallocationCSR(B, bs[j], Ai, Aj, val));
      PetscCall(PetscFree(val));
      PetscCall(MatSeqAIJRestoreArray(A, &Aa));
      PetscCall(MatRestoreRowIJ(A, 0, PETSC_FALSE, PETSC_FALSE, &An, &Ai, &Aj, &done));
      PetscCall(MatDenseRestoreArrayRead(T, &ptr));
      PetscCall(MatDestroy(&T));
      PetscCall(MatDestroy(&A));
      A = B; /* from here on, benchmark the inflated matrix */
    }
    /* reconvert back to SeqAIJ before converting to the desired type later */
    if (!convert) E = A; /* with in-place conversion (-convert_aij false), E aliases A */
    PetscCall(MatConvert(A, MATSEQAIJ, convert ? MAT_INITIAL_MATRIX : MAT_INPLACE_MATRIX, &E));
    PetscCall(MatSetOption(E, MAT_SYMMETRIC, PETSC_TRUE));
    for (PetscInt i = 0; i < ntype; ++i) {
      char        *tmp;
      PetscInt    *ia_ptr, *ja_ptr, k;
      PetscScalar *a_ptr;
#if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
      struct matrix_descr descr;
      sparse_matrix_t     spr;
      descr.type = SPARSE_MATRIX_TYPE_GENERAL;
      descr.diag = SPARSE_DIAG_NON_UNIT;
#endif
      if (convert) PetscCall(MatDestroy(&A)); /* A of the previous type; E holds the pristine SeqAIJ copy */
      PetscCall(PetscStrstr(type[i], "mkl", &tmp));
      if (tmp) {
        /* e.g. "baijmkl": convert to the base PETSc type ("baij") and drive MKL sparse BLAS directly */
        size_t mlen, tlen;
        char   base[256];

        mkl = PETSC_TRUE;
        PetscCall(PetscStrlen(tmp, &mlen));
        PetscCall(PetscStrlen(type[i], &tlen));
        PetscCall(PetscStrncpy(base, type[i], tlen - mlen + 1)); /* strip the trailing "mkl..." suffix */
        PetscCall(MatConvert(E, base, convert ? MAT_INITIAL_MATRIX : MAT_INPLACE_MATRIX, &A));
      } else {
        mkl = PETSC_FALSE;
        PetscCall(PetscStrstr(type[i], "maij", &tmp));
        if (!tmp) {
          PetscCall(MatConvert(E, type[i], convert ? MAT_INITIAL_MATRIX : MAT_INPLACE_MATRIX, &A));
        } else {
          /* "maij": keep A as AIJ, the MAIJ wrapper is created per N in the benchmark loop */
          PetscCall(MatConvert(E, MATAIJ, convert ? MAT_INITIAL_MATRIX : MAT_INPLACE_MATRIX, &A));
          maij = PETSC_TRUE;
        }
      }
      PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &cuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
      if (mkl) {
        /* build the MKL sparse handle from A's CSR/BSR data: AIJ -> zero-based CSR,
           (S)BAIJ -> one-based column-major BSR (Fortran indexing maximizes the
           cases covered by the MKL _mm routines) */
        const PetscInt *Ai, *Aj;
        PetscInt        An;
        PetscBool       done;

        PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJ, MATSEQBAIJ, MATSEQSBAIJ, ""));
        PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Not implemented");
        PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &flg));
        PetscCall(MatGetRowIJ(A, 0, PETSC_FALSE, flg ? PETSC_FALSE : PETSC_TRUE, &An, &Ai, &Aj, &done));
        PetscCheck(done, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Inconsistent sizes");
        /* MKL wants plain int-compatible copies of the row/column index arrays */
        PetscCall(PetscMalloc1(An + 1, &ia_ptr));
        PetscCall(PetscMalloc1(Ai[An], &ja_ptr));
        if (flg) { /* SeqAIJ */
          for (k = 0; k < An + 1; ++k) ia_ptr[k] = Ai[k];
          for (k = 0; k < Ai[An]; ++k) ja_ptr[k] = Aj[k];
          PetscCall(MatSeqAIJGetArray(A, &a_ptr));
          PetscCallMKLSparse(mkl_sparse_d_create_csr, (&spr, SPARSE_INDEX_BASE_ZERO, An, An, ia_ptr, ia_ptr + 1, ja_ptr, a_ptr));
        } else {
          PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &flg));
          if (flg) {
            for (k = 0; k < An + 1; ++k) ia_ptr[k] = Ai[k] + 1; /* Fortran indexing to maximize cases covered by _mm routines */
            for (k = 0; k < Ai[An]; ++k) ja_ptr[k] = Aj[k] + 1; /* Fortran indexing to maximize cases covered by _mm routines */
            PetscCall(MatSeqBAIJGetArray(A, &a_ptr));
            PetscCallMKLSparse(mkl_sparse_d_create_bsr, (&spr, SPARSE_INDEX_BASE_ONE, SPARSE_LAYOUT_COLUMN_MAJOR, An, An, bs[j], ia_ptr, ia_ptr + 1, ja_ptr, a_ptr));
          } else {
            PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &flg));
            if (flg) {
              for (k = 0; k < An + 1; ++k) ia_ptr[k] = Ai[k] + 1; /* Fortran indexing to maximize cases covered by _mm routines */
              for (k = 0; k < Ai[An]; ++k) ja_ptr[k] = Aj[k] + 1; /* Fortran indexing to maximize cases covered by _mm routines */
              PetscCall(MatSeqSBAIJGetArray(A, &a_ptr));
              PetscCallMKLSparse(mkl_sparse_d_create_bsr, (&spr, SPARSE_INDEX_BASE_ONE, SPARSE_LAYOUT_COLUMN_MAJOR, An, An, bs[j], ia_ptr, ia_ptr + 1, ja_ptr, a_ptr));
#if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
              /* SBAIJ stores only the upper triangle: tell MKL the matrix is symmetric */
              descr.type = SPARSE_MATRIX_TYPE_SYMMETRIC;
              descr.mode = SPARSE_FILL_MODE_UPPER;
              descr.diag = SPARSE_DIAG_NON_UNIT;
#endif
            }
          }
        }
        PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &flg));
        PetscCall(MatRestoreRowIJ(A, 0, PETSC_FALSE, flg ? PETSC_FALSE : PETSC_TRUE, &An, &Ai, &Aj, &done));
      }

      PetscCall(MatViewFromOptions(A, NULL, "-A_view"));

      /* benchmark loop over the number of dense columns N[k] */
      for (k = 0; k < nN; ++k) {
        MatType  Atype, Ctype;
        PetscInt AM, AN, CM, CN, t;
#if defined(PETSC_USE_LOG)
        PetscLogStage stage, tstage;
        char          stage_s[256];
#endif

        /* C is the random dense input, D receives the product */
        PetscCall(MatCreateDense(PETSC_COMM_WORLD, bs[j] * m, PETSC_DECIDE, bs[j] * M, N[k], NULL, &C));
        PetscCall(MatCreateDense(PETSC_COMM_WORLD, bs[j] * m, PETSC_DECIDE, bs[j] * M, N[k], NULL, &D));
        PetscCall(MatSetRandom(C, NULL));
        if (cuda) { /* convert to GPU if needed */
          PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
          PetscCall(MatConvert(D, MATDENSECUDA, MAT_INPLACE_MATRIX, &D));
        }
        if (mkl) {
          /* tell the inspector how the handle will be used (1 warm-up + trial timed calls), then optimize */
          if (N[k] > 1) PetscCallMKLSparse(mkl_sparse_set_mm_hint, (spr, SPARSE_OPERATION_NON_TRANSPOSE, descr, SPARSE_LAYOUT_COLUMN_MAJOR, N[k], 1 + trial));
          else PetscCallMKLSparse(mkl_sparse_set_mv_hint, (spr, SPARSE_OPERATION_NON_TRANSPOSE, descr, 1 + trial));
          PetscCallMKLSparse(mkl_sparse_set_memory_hint, (spr, SPARSE_MEMORY_AGGRESSIVE));
          PetscCallMKLSparse(mkl_sparse_optimize, (spr));
        }
        PetscCall(MatGetType(A, &Atype));
        PetscCall(MatGetType(C, &Ctype));
        PetscCall(MatGetSize(A, &AM, &AN));
        PetscCall(MatGetSize(C, &CM, &CN));

#if defined(PETSC_USE_LOG)
        /* one log stage per (type, bs, N) configuration; a separate stage for the transposed product */
        if (!maij || N[k] > 1) {
          PetscCall(PetscSNPrintf(stage_s, sizeof(stage_s), "type_%s-bs_%" PetscInt_FMT "-N_%02d", type[i], bs[j], (int)N[k]));
          PetscCall(PetscLogStageRegister(stage_s, &stage));
        }
        if (trans && N[k] > 1) {
          PetscCall(PetscSNPrintf(stage_s, sizeof(stage_s), "trans_type_%s-bs_%" PetscInt_FMT "-N_%02d", type[i], bs[j], (int)N[k]));
          PetscCall(PetscLogStageRegister(stage_s, &tstage));
        }
#endif
        /* A*B */
        if (N[k] > 1) {
          if (!maij) {
            /* symbolic phase done once outside the timed loop; only numeric is benchmarked */
            PetscCall(MatProductCreateWithMat(A, C, NULL, D));
            PetscCall(MatProductSetType(D, MATPRODUCT_AB));
            PetscCall(MatProductSetFromOptions(D));
            PetscCall(MatProductSymbolic(D));
          }

          if (!mkl) {
            if (!maij) {
              PetscCall(MatProductNumeric(D)); /* warm-up call, not timed */
              PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking MatProduct %s: with A %s %" PetscInt_FMT "x%" PetscInt_FMT " and B %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", MatProductTypes[MATPRODUCT_AB], Atype, AM, AN, Ctype, CM, CN));
              PetscCall(PetscLogStagePush(stage));
              for (t = 0; t < trial; ++t) PetscCall(MatProductNumeric(D));
              PetscCall(PetscLogStagePop());
            } else {
              /* MAIJ path: view the N[k] columns of C (resp. D) as one interleaved
                 vector of length AM*N[k] by transposing the local dense matrices,
                 then benchmark a single MatMult with the MAIJ wrapper */
              Mat                E, Ct, Dt; /* NOTE(review): this E shadows the outer SeqAIJ copy E — confirm intentional */
              Vec                cC, cD;
              const PetscScalar *c_ptr;
              PetscScalar       *d_ptr;
              PetscCall(MatCreateMAIJ(A, N[k], &E));
              PetscCall(MatDenseGetLocalMatrix(C, &Ct));
              PetscCall(MatDenseGetLocalMatrix(D, &Dt));
              PetscCall(MatTranspose(Ct, MAT_INPLACE_MATRIX, &Ct));
              PetscCall(MatTranspose(Dt, MAT_INPLACE_MATRIX, &Dt));
              PetscCall(MatDenseGetArrayRead(Ct, &c_ptr));
              PetscCall(MatDenseGetArrayWrite(Dt, &d_ptr));
              PetscCall(VecCreateMPIWithArray(PETSC_COMM_WORLD, 1, AM * N[k], PETSC_DECIDE, c_ptr, &cC));
              PetscCall(VecCreateMPIWithArray(PETSC_COMM_WORLD, 1, AM * N[k], PETSC_DECIDE, d_ptr, &cD));
              PetscCall(MatMult(E, cC, cD)); /* warm-up call, not timed */
              PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking MatMult: with A %s %" PetscInt_FMT "x%" PetscInt_FMT " and B %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", MATMAIJ, AM, AN, VECMPI, AM * N[k], 1));
              PetscCall(PetscLogStagePush(stage));
              for (t = 0; t < trial; ++t) PetscCall(MatMult(E, cC, cD));
              PetscCall(PetscLogStagePop());
              PetscCall(VecDestroy(&cD));
              PetscCall(VecDestroy(&cC));
              PetscCall(MatDestroy(&E));
              PetscCall(MatDenseRestoreArrayWrite(Dt, &d_ptr));
              PetscCall(MatDenseRestoreArrayRead(Ct, &c_ptr));
              /* transpose back so C and D recover their original layout for the -check comparison */
              PetscCall(MatTranspose(Ct, MAT_INPLACE_MATRIX, &Ct));
              PetscCall(MatTranspose(Dt, MAT_INPLACE_MATRIX, &Dt));
            }
          } else {
            /* MKL path: call mkl_sparse_d_mm on the raw column-major dense arrays */
            const PetscScalar *c_ptr;
            PetscScalar       *d_ptr;

            PetscCall(MatDenseGetArrayRead(C, &c_ptr));
            PetscCall(MatDenseGetArrayWrite(D, &d_ptr));
            PetscCallMKLSparse(mkl_sparse_d_mm, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, SPARSE_LAYOUT_COLUMN_MAJOR, c_ptr, CN, CM, 0.0, d_ptr, CM)); /* warm-up call, not timed */
            PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking mkl_sparse_d_mm (COLUMN_MAJOR): with A %s %" PetscInt_FMT "x%" PetscInt_FMT " and B %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", Atype, AM, AN, Ctype, CM, CN));
            PetscCall(PetscLogStagePush(stage));
            for (t = 0; t < trial; ++t) PetscCallMKLSparse(mkl_sparse_d_mm, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, SPARSE_LAYOUT_COLUMN_MAJOR, c_ptr, CN, CM, 0.0, d_ptr, CM));
            PetscCall(PetscLogStagePop());
            PetscCall(MatDenseRestoreArrayWrite(D, &d_ptr));
            PetscCall(MatDenseRestoreArrayRead(C, &c_ptr));
          }
        } else if (maij) { /* MatMult with a 1-column MAIJ wrapper is just MatMult with A: skip */
          PetscCall(MatDestroy(&C));
          PetscCall(MatDestroy(&D));
          continue;
        } else if (!mkl) {
          /* N[k] == 1: benchmark a plain MatMult on the single column of C */
          Vec cC, cD;

          PetscCall(MatDenseGetColumnVecRead(C, 0, &cC));
          PetscCall(MatDenseGetColumnVecWrite(D, 0, &cD));
          PetscCall(MatMult(A, cC, cD)); /* warm-up call, not timed */
          PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking MatMult: with A %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", Atype, AM, AN));
          PetscCall(PetscLogStagePush(stage));
          for (t = 0; t < trial; ++t) PetscCall(MatMult(A, cC, cD));
          PetscCall(PetscLogStagePop());
          PetscCall(MatDenseRestoreColumnVecRead(C, 0, &cC));
          PetscCall(MatDenseRestoreColumnVecWrite(D, 0, &cD));
        } else {
          /* N[k] == 1, MKL path: benchmark mkl_sparse_d_mv */
          const PetscScalar *c_ptr;
          PetscScalar       *d_ptr;

          PetscCall(MatDenseGetArrayRead(C, &c_ptr));
          PetscCall(MatDenseGetArrayWrite(D, &d_ptr));
          PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking mkl_sparse_d_mv: with A %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", Atype, AM, AN));
          PetscCallMKLSparse(mkl_sparse_d_mv, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, c_ptr, 0.0, d_ptr)); /* warm-up call, not timed */
          PetscCall(PetscLogStagePush(stage));
          for (t = 0; t < trial; ++t) PetscCallMKLSparse(mkl_sparse_d_mv, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, c_ptr, 0.0, d_ptr));
          PetscCall(PetscLogStagePop());
          PetscCall(MatDenseRestoreArrayWrite(D, &d_ptr));
          PetscCall(MatDenseRestoreArrayRead(C, &c_ptr));
        }

        if (check) { /* verify D == A*C numerically */
          PetscCall(MatMatMultEqual(A, C, D, 10, &flg));
          if (!flg) {
            MatType Dtype;

            PetscCall(MatGetType(D, &Dtype));
            PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Error with A %s%s, C %s, D %s, Nk %" PetscInt_FMT "\n", Atype, mkl ? "mkl" : "", Ctype, Dtype, N[k]));
          }
        }

        /* MKL implementation seems buggy for ABt */
        /* A*Bt */
        if (!mkl && trans && N[k] > 1) {
          PetscCall(PetscObjectTypeCompareAny((PetscObject)A, &flg, MATSEQAIJ, MATMPIAIJ, ""));
          if (flg) {
            PetscCall(MatTranspose(C, MAT_INPLACE_MATRIX, &C));
            PetscCall(MatGetType(C, &Ctype));
            if (!mkl) { /* NOTE(review): always true under the !mkl guard above — the else branch below looks unreachable (and pushes stage, not tstage); confirm */
              PetscCall(MatProductCreateWithMat(A, C, NULL, D));
              PetscCall(MatProductSetType(D, MATPRODUCT_ABt));
              PetscCall(MatProductSetFromOptions(D));
              PetscCall(MatProductSymbolic(D));
              PetscCall(MatProductNumeric(D)); /* warm-up call, not timed */
              PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking MatProduct %s: with A %s %" PetscInt_FMT "x%" PetscInt_FMT " and Bt %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", MatProductTypes[MATPRODUCT_ABt], Atype, AM, AN, Ctype, CM, CN));
              PetscCall(PetscLogStagePush(tstage));
              for (t = 0; t < trial; ++t) PetscCall(MatProductNumeric(D));
              PetscCall(PetscLogStagePop());
            } else {
              const PetscScalar *c_ptr;
              PetscScalar       *d_ptr;

              PetscCallMKLSparse(mkl_sparse_set_mm_hint, (spr, SPARSE_OPERATION_NON_TRANSPOSE, descr, SPARSE_LAYOUT_ROW_MAJOR, N[k], 1 + trial));
              PetscCallMKLSparse(mkl_sparse_optimize, (spr));
              PetscCall(MatDenseGetArrayRead(C, &c_ptr));
              PetscCall(MatDenseGetArrayWrite(D, &d_ptr));
              PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Benchmarking mkl_sparse_d_mm (ROW_MAJOR): with A %s %" PetscInt_FMT "x%" PetscInt_FMT " and B %s %" PetscInt_FMT "x%" PetscInt_FMT "\n", Atype, AM, AN, Ctype, CM, CN));
              PetscCallMKLSparse(mkl_sparse_d_mm, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, SPARSE_LAYOUT_ROW_MAJOR, c_ptr, CN, CM, 0.0, d_ptr, CM));
              PetscCall(PetscLogStagePush(stage));
              for (t = 0; t < trial; ++t) PetscCallMKLSparse(mkl_sparse_d_mm, (SPARSE_OPERATION_NON_TRANSPOSE, 1.0, spr, descr, SPARSE_LAYOUT_ROW_MAJOR, c_ptr, CN, CM, 0.0, d_ptr, CM));
              PetscCall(PetscLogStagePop());
              PetscCall(MatDenseRestoreArrayWrite(D, &d_ptr));
              PetscCall(MatDenseRestoreArrayRead(C, &c_ptr));
            }
          }
        }

        if (!mkl && trans && N[k] > 1 && flg && check) { /* verify D == A*C^T numerically */
          PetscCall(MatMatTransposeMultEqual(A, C, D, 10, &flg));
          if (!flg) {
            MatType Dtype;
            PetscCall(MatGetType(D, &Dtype));
            PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Error with A %s%s, C %s, D %s, Nk %" PetscInt_FMT "\n", Atype, mkl ? "mkl" : "", Ctype, Dtype, N[k]));
          }
        }
        PetscCall(MatDestroy(&C));
        PetscCall(MatDestroy(&D));
      }
      if (mkl) { /* release the MKL handle and the index copies it was built from */
        PetscCallMKLSparse(mkl_sparse_destroy, (spr));
        PetscCall(PetscFree(ia_ptr));
        PetscCall(PetscFree(ja_ptr));
      }
      if (cuda && i != ntype - 1) {
        PetscCall(PetscPrintf(PETSC_COMM_WORLD, "AIJCUSPARSE must be last, otherwise MatConvert() to another MatType is too slow\n"));
        break;
      }
    }
    if (E != A) PetscCall(MatDestroy(&E)); /* E is a separate copy only when -convert_aij is true */
    PetscCall(MatDestroy(&A));
  }
  for (m = 0; m < ntype; ++m) PetscCall(PetscFree(type[m]));
  PetscCall(PetscFinalize());
  return 0;
}
398 
399 /*TEST
400    build:
401      requires: double !complex !defined(PETSC_USE_64BIT_INDICES)
402 
403    testset:
404      nsize: 1
405      filter: sed "/Benchmarking/d"
406      args: -f ${wPETSC_DIR}/share/petsc/datafiles/matrices/spd-real-int32-float64 -bs 1,2,3 -N 1,2,18 -check -trans -convert_aij {{false true}shared output}
407      test:
408        suffix: basic
409        args: -type aij,sbaij,baij
410        output_file: output/ex237.out
411      test:
412        suffix: maij
413        args: -type aij,maij
414        output_file: output/ex237.out
415      test:
416        suffix: cuda
417        requires: cuda
418        args: -type aij,aijcusparse
419        output_file: output/ex237.out
420      test:
421        suffix: mkl
422        requires: mkl_sparse_optimize
423        args: -type aij,aijmkl,baijmkl,sbaijmkl
424        output_file: output/ex237.out
425 
426 TEST*/
427