xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision cb3ff29fa5c880872e59c11fa7fc2fbe1f738e0e)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
10 {
11   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
12 
13   PetscFunctionBegin;
14   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
15   PetscCall(MatStashDestroy_Private(&mat->stash));
16   PetscCall(VecDestroy(&aij->diag));
17   PetscCall(MatDestroy(&aij->A));
18   PetscCall(MatDestroy(&aij->B));
19 #if defined(PETSC_USE_CTABLE)
20   PetscCall(PetscHMapIDestroy(&aij->colmap));
21 #else
22   PetscCall(PetscFree(aij->colmap));
23 #endif
24   PetscCall(PetscFree(aij->garray));
25   PetscCall(VecDestroy(&aij->lvec));
26   PetscCall(VecScatterDestroy(&aij->Mvctx));
27   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
28   PetscCall(PetscFree(aij->ld));
29 
30   PetscCall(PetscFree(mat->data));
31 
32   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
33   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
34 
35   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
36   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
37   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
38   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
39   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
40   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
41   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
42   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
43   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
44   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
45 #if defined(PETSC_HAVE_CUDA)
46   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
47 #endif
48 #if defined(PETSC_HAVE_HIP)
49   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
50 #endif
51 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
52   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
53 #endif
54   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
55 #if defined(PETSC_HAVE_ELEMENTAL)
56   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
57 #endif
58 #if defined(PETSC_HAVE_SCALAPACK)
59   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
60 #endif
61 #if defined(PETSC_HAVE_HYPRE)
62   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
63   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
64 #endif
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
71 #if defined(PETSC_HAVE_MKL_SPARSE)
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
73 #endif
74   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
76   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
77   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
/* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
83 #define TYPE AIJ
84 #define TYPE_AIJ
85 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
86 #undef TYPE
87 #undef TYPE_AIJ
88 
/*
  MatGetRowIJ_MPIAIJ - Returns compressed row storage (ia/ja) for the merged local rows of an MPIAIJ matrix.

  Builds a sequential copy of the locally owned rows with MatMPIAIJGetLocalMat() and delegates to
  MatGetRowIJ() on that copy. The copy is composed onto A under the key "MatGetRowIJ_MPIAIJ" so that
  (a) it stays alive after the MatDestroy() below (the compose holds a reference), keeping the returned
  ia/ja pointers valid, and (b) MatRestoreRowIJ_MPIAIJ() can find it again to release everything.
*/
static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
  PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
  PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
  /* drops only this local reference; the composed reference keeps B (and ia/ja) alive */
  PetscCall(MatDestroy(&B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
100 
/*
  MatRestoreRowIJ_MPIAIJ - Releases the ia/ja arrays obtained from MatGetRowIJ_MPIAIJ().

  Looks up the sequential local matrix that MatGetRowIJ_MPIAIJ() composed onto A, restores its
  row pointers, and then clears the composed reference, which destroys the local copy.
  Assumes MatGetRowIJ_MPIAIJ() was called first; otherwise the query yields NULL.
*/
static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
  PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
  /* clearing the composed slot drops the last reference and destroys B */
  PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
111 
112 /*MC
113    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
114 
   This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
116    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
117   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
118   for communicators controlling multiple processes.  It is recommended that you call both of
119   the above preallocation routines for simplicity.
120 
121    Options Database Key:
122 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
123 
  Level: beginner

  Developer Note:
  Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`; the type also automatically
  switches over to use inodes when enough exist.
129 
130 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`, `MATSEQAIJ`, `MATMPIAIJ`
131 M*/
132 
133 /*MC
134    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
135 
136    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
137    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
138    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
139   for communicators controlling multiple processes.  It is recommended that you call both of
140   the above preallocation routines for simplicity.
141 
142    Options Database Key:
143 . -mat_type aijcrl - sets the matrix type to `MATMPIAIJCRL` during a call to `MatSetFromOptions()`
144 
145   Level: beginner
146 
147 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL`, `MATSEQAIJCRL`, `MATMPIAIJCRL`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
148 M*/
149 
/*
  MatBindToCPU_MPIAIJ - Propagates a CPU-binding request to the sub-objects of an MPIAIJ matrix.

  The flag on the parent matrix itself is only recorded when a device backend is configured;
  otherwise the field is left untouched. The request is then forwarded to the diagonal (A) and
  off-diagonal (B) blocks and to the associated work vectors, when they exist.
*/
static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
  A->boundtocpu = flg;
#endif
  if (a->A) PetscCall(MatBindToCPU(a->A, flg));
  if (a->B) PetscCall(MatBindToCPU(a->B, flg));

  /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
   * This maybe seems a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
   * to differ from the parent matrix. */
  if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
  if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));

  PetscFunctionReturn(PETSC_SUCCESS);
}
169 
170 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
171 {
172   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
173 
174   PetscFunctionBegin;
175   if (mat->A) {
176     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
177     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
178   }
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
/*
  MatFindNonzeroRows_MPIAIJ - Builds an index set of the locally owned rows that contain at
  least one nonzero value (in either the diagonal or off-diagonal block).

  Two passes over the local rows: the first counts rows that are entirely zero (cnt), the
  second fills the global indices of the remaining rows. If no process has a zero row,
  *keptrows is left NULL, signaling "all rows kept".

  Collective: performs an Allreduce; all ranks must call it together.
*/
static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
{
  Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
  Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
  Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
  const PetscInt  *ia, *ib;
  const MatScalar *aa, *bb, *aav, *bav;
  PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
  PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;

  PetscFunctionBegin;
  *keptrows = NULL;

  ia = a->i;
  ib = b->i;
  PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
  /* pass 1: count rows with no nonzero value; cnt counts zero rows */
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) {
      /* structurally empty row */
      cnt++;
      goto ok1;
    }
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) goto ok1; /* row has a nonzero in the diagonal block */
    }
    bb = bav ? bav + ib[i] : NULL;
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) goto ok1; /* row has a nonzero in the off-diagonal block */
    }
    cnt++; /* allocated entries present, but all numerically zero */
  ok1:;
  }
  /* n0rows = global number of zero rows; if none anywhere, keep *keptrows == NULL */
  PetscCall(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
  if (!n0rows) {
    PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
    PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* pass 2: collect global indices of the m - cnt rows that have a nonzero */
  PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
  cnt = 0;
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) continue;
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
    bb = bav ? bav + ib[i] : NULL;
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
  ok2:;
  }
  /* the IS takes ownership of rows */
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
  PetscFunctionReturn(PETSC_SUCCESS);
}
250 
251 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
252 {
253   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
254   PetscBool   cong;
255 
256   PetscFunctionBegin;
257   PetscCall(MatHasCongruentLayouts(Y, &cong));
258   if (Y->assembled && cong) {
259     PetscCall(MatDiagonalSet(aij->A, D, is));
260   } else {
261     PetscCall(MatDiagonalSet_Default(Y, D, is));
262   }
263   PetscFunctionReturn(PETSC_SUCCESS);
264 }
265 
266 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
267 {
268   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
269   PetscInt    i, rstart, nrows, *rows;
270 
271   PetscFunctionBegin;
272   *zrows = NULL;
273   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
274   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
275   for (i = 0; i < nrows; i++) rows[i] += rstart;
276   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
277   PetscFunctionReturn(PETSC_SUCCESS);
278 }
279 
/*
  MatGetColumnReductions_MPIAIJ - Computes a per-column reduction (norm, sum, or mean)
  over all rows of the matrix.

  Each rank accumulates contributions of its local entries into a work array of global
  length n, mapping off-diagonal (B) column indices back to global numbering via garray;
  the arrays are then combined across ranks with MAX (for the infinity norm) or SUM
  (all other types). NORM_2 results are square-rooted and MEAN results divided by the
  global row count as post-processing.

  Collective: performs an Allreduce; all ranks must call it together.
*/
static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
{
  Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
  PetscInt           i, m, n, *garray = aij->garray;
  Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
  Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
  PetscReal         *work;
  const PetscScalar *dummy;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &m, &n));
  PetscCall(PetscCalloc1(n, &work));
  /* NOTE(review): these get/restore pairs look like they exist only to force the value
     arrays (a_aij->a, b_aij->a) up to date on the host before the raw accesses below —
     confirm against MatSeqAIJGetArrayRead() semantics */
  PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
  if (type == NORM_2) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
  } else if (type == NORM_1) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
  } else if (type == NORM_INFINITY) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
  } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
  } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
  } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
  /* combine partial results across ranks: MAX only for the infinity norm */
  if (type == NORM_INFINITY) {
    PetscCall(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
  } else {
    PetscCall(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscCall(PetscFree(work));
  /* post-processing: sqrt for the 2-norm, divide by global row count for means */
  if (type == NORM_2) {
    for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
  } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < n; i++) reductions[i] /= m;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
325 
/*
  MatFindOffBlockDiagonalEntries_MPIAIJ - Builds an index set (global numbering) of the
  locally owned rows that contain an entry outside the block diagonal.

  Two sources are merged: rows of the diagonal block A with off-block-diagonal entries
  (sis) and rows of the off-diagonal block B that are nonzero at all (gis) — any entry
  in B is off the block diagonal by construction. The two lists are concatenated,
  sorted with duplicates removed, and shifted to global numbering.
*/
static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  IS              sis, gis;
  const PetscInt *isis, *igis;
  PetscInt        n, *iis, nsis, ngis, rstart, i;

  PetscFunctionBegin;
  PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
  PetscCall(MatFindNonzeroRows(a->B, &gis));
  PetscCall(ISGetSize(gis, &ngis));
  PetscCall(ISGetSize(sis, &nsis));
  PetscCall(ISGetIndices(sis, &isis));
  PetscCall(ISGetIndices(gis, &igis));

  /* concatenate, then sort and unique the combined local row list */
  PetscCall(PetscMalloc1(ngis + nsis, &iis));
  PetscCall(PetscArraycpy(iis, igis, ngis));
  PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
  n = ngis + nsis;
  PetscCall(PetscSortRemoveDupsInt(&n, iis));
  PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
  for (i = 0; i < n; i++) iis[i] += rstart;
  /* the IS takes ownership of iis */
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));

  PetscCall(ISRestoreIndices(sis, &isis));
  PetscCall(ISRestoreIndices(gis, &igis));
  PetscCall(ISDestroy(&sis));
  PetscCall(ISDestroy(&gis));
  PetscFunctionReturn(PETSC_SUCCESS);
}
356 
357 /*
358   Local utility routine that creates a mapping from the global column
359 number to the local number in the off-diagonal part of the local
360 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable at
361 a slightly higher hash table cost; without it it is not scalable (each processor
362 has an order N integer array but is fast to access.
363 */
PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    n   = aij->B->cmap->n, i; /* number of off-diagonal (B) columns on this rank */

  PetscFunctionBegin;
  PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
#if defined(PETSC_USE_CTABLE)
  /* store both key and value shifted by +1 so a hash-miss default of 0 is distinguishable from column 0 */
  PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
  for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
#else
  /* dense array of global length N; entries are local index + 1, with 0 (from calloc) meaning "not present" */
  PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
  for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
380 
/*
  MatSetValues_SeqAIJ_A_Private - Inserts or adds a single (row, col) entry into the diagonal
  block A of an MPIAIJ matrix. Used only from MatSetValues_MPIAIJ(), which supplies the state
  variables it reads and writes (rp1/ap1: row's column-index and value arrays; low1/high1:
  binary-search window; lastcol1: previous column, exploited for monotone insertion; nrow1,
  rmax1, ailen, nonew, ignorezeroentries). orow/ocol are the global indices, used only in
  error messages. If the column is absent and insertion is permitted, later entries in the
  row are shifted up (reallocating when the row is full) and the matrix nonzero state bumped;
  zero values may be dropped (except on the diagonal) and new-nonzero insertion can be
  silently skipped (nonew == 1) or made an error (nonew == -1).
*/
#define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol1) low1 = 0; \
    else high1 = nrow1; \
    lastcol1 = col; \
    while (high1 - low1 > 5) { \
      t = (low1 + high1) / 2; \
      if (rp1[t] > col) high1 = t; \
      else low1 = t; \
    } \
    for (_i = low1; _i < high1; _i++) { \
      if (rp1[_i] > col) break; \
      if (rp1[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap1[_i] += value; \
          /* Not sure if LogFlops will slow down the code or not */ \
          (void)PetscLogFlops(1.0); \
        } else ap1[_i] = value; \
        goto a_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries && row != col) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    if (nonew == 1) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
    N = nrow1++ - 1; \
    a->nz++; \
    high1++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
    rp1[_i] = col; \
    ap1[_i] = value; \
    A->nonzerostate++; \
  a_noinsert:; \
    ailen[row] = nrow1; \
  } while (0)
426 
/*
  MatSetValues_SeqAIJ_B_Private - Off-diagonal-block (B) counterpart of
  MatSetValues_SeqAIJ_A_Private(); see that macro for the overall algorithm.
  Identical logic operating on the *2-suffixed state variables, with one difference:
  the zero-value drop test has no "row != col" exception, since B never stores
  block-diagonal entries.
*/
#define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol2) low2 = 0; \
    else high2 = nrow2; \
    lastcol2 = col; \
    while (high2 - low2 > 5) { \
      t = (low2 + high2) / 2; \
      if (rp2[t] > col) high2 = t; \
      else low2 = t; \
    } \
    for (_i = low2; _i < high2; _i++) { \
      if (rp2[_i] > col) break; \
      if (rp2[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap2[_i] += value; \
          (void)PetscLogFlops(1.0); \
        } else ap2[_i] = value; \
        goto b_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    if (nonew == 1) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
    N = nrow2++ - 1; \
    b->nz++; \
    high2++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
    rp2[_i] = col; \
    ap2[_i] = value; \
    B->nonzerostate++; \
  b_noinsert:; \
    bilen[row] = nrow2; \
  } while (0)
471 
/*
  MatSetValuesRow_MPIAIJ - Overwrites the values of one locally owned row, given the
  values v[] in global-column order for exactly the existing nonzero pattern.

  The row's storage is split into three segments in global-column order:
    1. off-diagonal entries left of the owned column range (first l entries of B's row),
    2. the diagonal-block entries (A's row),
    3. the remaining off-diagonal entries right of the owned range.
  v[] must supply the values in that concatenated order.

  NOTE(review): per the comment below this only works for square matrices — the row
  ownership offset is reused as the column split point. Assumes garray exists (B in
  compressed/local column form) — confirm callers guarantee an assembled matrix.
*/
static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
{
  Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
  PetscInt     l, *garray                         = mat->garray, diag;
  PetscScalar *aa, *ba;

  PetscFunctionBegin;
  /* code only works for square matrices A */

  /* find size of row to the left of the diagonal part */
  PetscCall(MatGetOwnershipRange(A, &diag, NULL));
  row = row - diag; /* convert to local row index */
  for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
    if (garray[b->j[b->i[row] + l]] > diag) break; /* l = number of B entries left of the owned block */
  }
  if (l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row], v, l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }

  /* diagonal part */
  if (a->i[row + 1] - a->i[row]) {
    PetscCall(MatSeqAIJGetArray(mat->A, &aa));
    PetscCall(PetscArraycpy(aa + a->i[row], v + l, (a->i[row + 1] - a->i[row])));
    PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
  }

  /* right of diagonal part */
  if (b->i[row + 1] - b->i[row] - l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
509 
/*
  MatSetValues_MPIAIJ - Inserts or adds a logically dense m x n block of values v into the
  matrix at global rows im[] and columns in[].

  Locally owned rows are routed to the diagonal block (column inside [cstart, cend)) or the
  off-diagonal block via the MatSetValues_SeqAIJ_{A,B}_Private macros; off-process rows are
  buffered in the stash for communication during assembly. Negative row/column indices are
  silently skipped. If a new off-diagonal column appears after a previous assembly, the
  matrix is "disassembled" (B expanded back to global column numbering) before insertion.
*/
PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
  PetscScalar value = 0.0;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
  PetscBool   roworiented = aij->roworiented;

  /* Some Variables required in the macro */
  Mat         A     = aij->A;
  Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
  PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
  PetscBool   ignorezeroentries = a->ignorezeroentries;
  Mat         B                 = aij->B;
  Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
  PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
  MatScalar  *aa, *ba;
  PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
  PetscInt    nonew;
  MatScalar  *ap1, *ap2;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArray(A, &aa));
  PetscCall(MatSeqAIJGetArray(B, &ba));
  for (i = 0; i < m; i++) {
    if (im[i] < 0) continue; /* negative rows are ignored by convention */
    PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
    if (im[i] >= rstart && im[i] < rend) {
      /* locally owned row: prime the per-row search state used by the insertion macros */
      row      = im[i] - rstart;
      lastcol1 = -1;
      rp1      = aj ? aj + ai[row] : NULL;
      ap1      = aa ? aa + ai[row] : NULL;
      rmax1    = aimax[row];
      nrow1    = ailen[row];
      low1     = 0;
      high1    = nrow1;
      lastcol2 = -1;
      rp2      = bj ? bj + bi[row] : NULL;
      ap2      = ba ? ba + bi[row] : NULL;
      rmax2    = bimax[row];
      nrow2    = bilen[row];
      low2     = 0;
      high2    = nrow2;

      for (j = 0; j < n; j++) {
        if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
        if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
        if (in[j] >= cstart && in[j] < cend) {
          /* column inside the owned range -> diagonal block */
          col   = in[j] - cstart;
          nonew = a->nonew;
          MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
        } else if (in[j] < 0) {
          continue;
        } else {
          PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          if (mat->was_assembled) {
            /* B is in compressed/local column numbering; translate the global column */
            if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
            col--;
#else
            col = aij->colmap[in[j]] - 1;
#endif
            if (col < 0 && !((Mat_SeqAIJ *)(aij->B->data))->nonew) { /* col < 0 means in[j] is a new col for B */
              PetscCall(MatDisAssemble_MPIAIJ(mat));                 /* Change aij->B from reduced/local format to expanded/global format */
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
              B     = aij->B;
              b     = (Mat_SeqAIJ *)B->data;
              bimax = b->imax;
              bi    = b->i;
              bilen = b->ilen;
              bj    = b->j;
              ba    = b->a;
              rp2   = bj + bi[row];
              ap2   = ba + bi[row];
              rmax2 = bimax[row];
              nrow2 = bilen[row];
              low2  = 0;
              high2 = nrow2;
              bm    = aij->B->rmap->n;
              ba    = b->a;
            } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
              /* new off-diagonal nonzero while nonew is set: warn-and-skip (nonew==1) or error */
              if (1 == ((Mat_SeqAIJ *)(aij->B->data))->nonew) {
                PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
              } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
            }
          } else col = in[j]; /* before first assembly B uses global column numbering */
          nonew = b->nonew;
          MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
        }
      }
    } else {
      /* off-process row: stash for exchange during MatAssemblyBegin/End */
      PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
      if (!aij->donotstash) {
        mat->assembled = PETSC_FALSE;
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v ? v + i * n : NULL, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v ? v + i : NULL, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
  }
  PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, bb might have been free'd due to reallocation above. But we don't access them here */
  PetscCall(MatSeqAIJRestoreArray(B, &ba));
  PetscFunctionReturn(PETSC_SUCCESS);
}
618 
619 /*
620     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
621     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
    No off-processor parts of the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
623 */
624 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
625 {
626   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
627   Mat         A      = aij->A; /* diagonal part of the matrix */
628   Mat         B      = aij->B; /* off-diagonal part of the matrix */
629   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
630   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
631   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
632   PetscInt   *ailen = a->ilen, *aj = a->j;
633   PetscInt   *bilen = b->ilen, *bj = b->j;
634   PetscInt    am          = aij->A->rmap->n, j;
635   PetscInt    diag_so_far = 0, dnz;
636   PetscInt    offd_so_far = 0, onz;
637 
638   PetscFunctionBegin;
639   /* Iterate over all rows of the matrix */
640   for (j = 0; j < am; j++) {
641     dnz = onz = 0;
642     /*  Iterate over all non-zero columns of the current row */
643     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
644       /* If column is in the diagonal */
645       if (mat_j[col] >= cstart && mat_j[col] < cend) {
646         aj[diag_so_far++] = mat_j[col] - cstart;
647         dnz++;
648       } else { /* off-diagonal entries */
649         bj[offd_so_far++] = mat_j[col];
650         onz++;
651       }
652     }
653     ailen[j] = dnz;
654     bilen[j] = onz;
655   }
656   PetscFunctionReturn(PETSC_SUCCESS);
657 }
658 
659 /*
660     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
661     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
    No off-processor parts of the matrix are allowed here, they are set at a later point by MatSetValues_MPIAIJ.
663     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
664     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
665 */
PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
{
  Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
  Mat          A    = aij->A; /* diagonal part of the matrix */
  Mat          B    = aij->B; /* off-diagonal part of the matrix */
  /* NOTE(review): aijd/aijo alias the same structures as a/b below; kept separate only to name the full i arrays */
  Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)(aij->A)->data, *aijo = (Mat_SeqAIJ *)(aij->B)->data;
  Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
  PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend; /* [cstart,cend) are the locally owned columns, i.e. the diagonal block */
  PetscInt    *ailen = a->ilen, *aj = a->j; /* per-row used lengths and column indices of the diagonal block */
  PetscInt    *bilen = b->ilen, *bj = b->j; /* per-row used lengths and column indices of the off-diagonal block */
  PetscInt     am          = aij->A->rmap->n, j; /* number of locally owned rows */
  PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
  PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
  PetscScalar *aa = a->a, *ba = b->a; /* value arrays of the diagonal and off-diagonal blocks */

  PetscFunctionBegin;
  /* Iterate over all rows of the matrix */
  for (j = 0; j < am; j++) {
    dnz_row = onz_row = 0; /* counters for diagonal/off-diagonal nonzeros inserted in this row */
    rowstart_offd     = full_offd_i[j];
    rowstart_diag     = full_diag_i[j];
    /*  Iterate over all non-zero columns of the current row */
    for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
      /* If column is in the diagonal */
      if (mat_j[col] >= cstart && mat_j[col] < cend) {
        aj[rowstart_diag + dnz_row] = mat_j[col] - cstart; /* diagonal block stores LOCAL column indices */
        aa[rowstart_diag + dnz_row] = mat_a[col];
        dnz_row++;
      } else { /* off-diagonal entries */
        bj[rowstart_offd + onz_row] = mat_j[col]; /* off-diagonal block keeps the GLOBAL column index at this stage */
        ba[rowstart_offd + onz_row] = mat_a[col];
        onz_row++;
      }
    }
    ailen[j] = dnz_row;
    bilen[j] = onz_row;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
706 
static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend; /* locally owned row range */
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col; /* locally owned column range */

  PetscFunctionBegin;
  /* Fetch entries (idxm[i], idxn[j]) into the row-major array v; only locally owned rows may be requested */
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue; /* negative row */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
    PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
    row = idxm[i] - rstart;
    for (j = 0; j < n; j++) {
      if (idxn[j] < 0) continue; /* negative column */
      PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
      if (idxn[j] >= cstart && idxn[j] < cend) {
        /* owned column: entry lives in the diagonal block */
        col = idxn[j] - cstart;
        PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
      } else {
        /* off-process column: translate the global index through the colmap
           (colmap stores local index + 1; 0 / absent means the column is not stored) */
        if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
        col--;
#else
        col = aij->colmap[idxn[j]] - 1;
#endif
        /* entry not present in the off-diagonal block -> value is an (unstored) zero */
        if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
        else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
740 
741 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
742 {
743   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
744   PetscInt    nstash, reallocs;
745 
746   PetscFunctionBegin;
747   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
748 
749   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
750   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
751   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
  PetscMPIInt  n;
  PetscInt     i, j, rstart, ncols, flg;
  PetscInt    *row, *col;
  PetscBool    other_disassembled;
  PetscScalar *val;

  /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */

  PetscFunctionBegin;
  /* Drain the stash: receive values destined for our rows that other ranks set, and insert them locally */
  if (!aij->donotstash && !mat->nooffprocentries) {
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break; /* no more incoming messages */

      for (i = 0; i < n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j = i, rstart = row[j]; j < n; j++) {
          if (row[j] != rstart) break;
        }
        if (j < n) ncols = j - i;
        else ncols = n - i;
        /* Now assemble all these values with a single function call */
        PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
        i = j;
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
  /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
  if (mat->boundtocpu) {
    PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
    PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
  }
#endif
  PetscCall(MatAssemblyBegin(aij->A, mode));
  PetscCall(MatAssemblyEnd(aij->A, mode));

  /* determine if any processor has disassembled, if so we must
     also disassemble ourself, in order that we may reassemble. */
  /*
     if nonzero structure of submatrix B cannot change then we know that
     no processor disassembled thus we can skip this stuff
  */
  if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
    /* MPI_LAND of was_assembled: false on any rank means some rank disassembled */
    PetscCall(MPIU_Allreduce(&mat->was_assembled, &other_disassembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (mat->was_assembled && !other_disassembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
      PetscCall(MatDisAssemble_MPIAIJ(mat));
    }
  }
  /* first final assembly: build colmap/garray and the scatter for matrix-vector products */
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
  PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscCall(MatAssemblyBegin(aij->B, mode));
  PetscCall(MatAssemblyEnd(aij->B, mode));

  /* invalidate cached row buffers and diagonal, they may be stale after new values arrived */
  PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));

  aij->rowvalues = NULL;

  PetscCall(VecDestroy(&aij->diag));

  /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
  if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)(aij->A->data))->nonew) {
    PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
    PetscCall(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
  }
#if defined(PETSC_HAVE_DEVICE)
  mat->offloadmask = PETSC_OFFLOAD_BOTH;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
833 
834 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
835 {
836   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
837 
838   PetscFunctionBegin;
839   PetscCall(MatZeroEntries(l->A));
840   PetscCall(MatZeroEntries(l->B));
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)A->data;
  PetscObjectState sA, sB; /* nonzero states of the blocks before zeroing, to detect pattern changes */
  PetscInt        *lrows;
  PetscInt         r, len;
  PetscBool        cong, lch, gch;

  PetscFunctionBegin;
  /* get locally owned rows */
  PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
  PetscCall(MatHasCongruentLayouts(A, &cong));
  /* fix right hand side if needed */
  if (x && b) {
    const PetscScalar *xx;
    PetscScalar       *bb;

    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    PetscCall(VecGetArrayRead(x, &xx));
    PetscCall(VecGetArray(b, &bb));
    /* b_i = diag * x_i for every zeroed local row i */
    for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
    PetscCall(VecRestoreArrayRead(x, &xx));
    PetscCall(VecRestoreArray(b, &bb));
  }

  sA = mat->A->nonzerostate;
  sB = mat->B->nonzerostate;

  if (diag != 0.0 && cong) {
    /* congruent layouts: the diagonal entry is guaranteed to live in the diagonal block */
    PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  } else if (diag != 0.0) { /* non-square or non congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
    Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
    Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
    PetscInt    nnwA, nnwB; /* saved 'nonew' settings, restored after the diagonal insertions */
    PetscBool   nnzA, nnzB; /* keepnonzeropattern flags of the two blocks */

    nnwA = aijA->nonew;
    nnwB = aijB->nonew;
    nnzA = aijA->keepnonzeropattern;
    nnzB = aijB->keepnonzeropattern;
    if (!nnzA) {
      PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
      aijA->nonew = 0; /* temporarily allow new nonzero locations */
    }
    if (!nnzB) {
      PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
      aijB->nonew = 0; /* temporarily allow new nonzero locations */
    }
    /* Must zero here before the next loop */
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
    /* insert the requested diagonal value explicitly (may create a new nonzero location) */
    for (r = 0; r < len; ++r) {
      const PetscInt row = lrows[r] + A->rmap->rstart;
      if (row >= A->cmap->N) continue; /* rows beyond the last column have no diagonal entry */
      PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
    }
    aijA->nonew = nnwA;
    aijB->nonew = nnwB;
  } else {
    /* diag == 0: just wipe the rows in both blocks */
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  }
  PetscCall(PetscFree(lrows));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

  /* reduce nonzerostate */
  lch = (PetscBool)(sA != mat->A->nonzerostate || sB != mat->B->nonzerostate);
  PetscCall(MPIU_Allreduce(&lch, &gch, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)A)));
  if (gch) A->nonzerostate++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
917 
static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
  PetscMPIInt        n = A->rmap->n; /* local row count; NOTE(review): reused further below as a per-row nonzero count */
  PetscInt           i, j, r, m, len = 0;
  PetscInt          *lrows, *owners = A->rmap->range;
  PetscMPIInt        p = 0;
  PetscSFNode       *rrows;
  PetscSF            sf;
  const PetscScalar *xx;
  PetscScalar       *bb, *mask, *aij_a;
  Vec                xmask, lmask;
  Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
  const PetscInt    *aj, *ii, *ridx;
  PetscScalar       *aa;

  PetscFunctionBegin;
  /* Create SF where leaves are input rows and roots are owned rows */
  PetscCall(PetscMalloc1(n, &lrows));
  for (r = 0; r < n; ++r) lrows[r] = -1; /* -1 marks "row not requested" */
  PetscCall(PetscMalloc1(N, &rrows));
  for (r = 0; r < N; ++r) {
    const PetscInt idx = rows[r];
    PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
    if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
      PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
    }
    rrows[r].rank  = p;
    rrows[r].index = rows[r] - owners[p];
  }
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
  /* Collect flags for rows to be zeroed */
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFDestroy(&sf));
  /* Compress and put in row numbers */
  for (r = 0; r < n; ++r)
    if (lrows[r] >= 0) lrows[len++] = r; /* lrows[0..len) now holds local indices of rows to zero */
  /* zero diagonal part of matrix */
  PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
  /* handle off-diagonal part of matrix */
  /* build a 0/1 mask over the ghost columns: 1 marks a column whose global row is being zeroed */
  PetscCall(MatCreateVecs(A, &xmask, NULL));
  PetscCall(VecDuplicate(l->lvec, &lmask));
  PetscCall(VecGetArray(xmask, &bb));
  for (i = 0; i < len; i++) bb[lrows[i]] = 1;
  PetscCall(VecRestoreArray(xmask, &bb));
  PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecDestroy(&xmask));
  if (x && b) { /* this code is buggy when the row and column layout don't match */
    PetscBool cong;

    PetscCall(MatHasCongruentLayouts(A, &cong));
    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    /* gather ghost values of x so b can be updated for eliminated columns */
    PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecGetArrayRead(l->lvec, &xx));
    PetscCall(VecGetArray(b, &bb));
  }
  PetscCall(VecGetArray(lmask, &mask));
  /* remove zeroed rows of off-diagonal matrix */
  PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
  ii = aij->i;
  for (i = 0; i < len; i++) PetscCall(PetscArrayzero(aij_a + ii[lrows[i]], ii[lrows[i] + 1] - ii[lrows[i]]));
  /* loop over all elements of off process part of matrix zeroing removed columns*/
  if (aij->compressedrow.use) {
    m    = aij->compressedrow.nrows;
    ii   = aij->compressedrow.i;
    ridx = aij->compressedrow.rindex; /* maps compressed row slots back to true local rows */
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i]; /* nonzeros in this (compressed) row */
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];

      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) { /* this column belongs to a zeroed row on its owner */
          if (b) bb[*ridx] -= *aa * xx[*aj]; /* move the eliminated term to the right-hand side */
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
      ridx++;
    }
  } else { /* do not use compressed row format */
    m = l->B->rmap->n;
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i]; /* nonzeros in local row i */
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];
      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) { /* this column belongs to a zeroed row on its owner */
          if (b) bb[i] -= *aa * xx[*aj]; /* move the eliminated term to the right-hand side */
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
    }
  }
  if (x && b) {
    PetscCall(VecRestoreArray(b, &bb));
    PetscCall(VecRestoreArrayRead(l->lvec, &xx));
  }
  PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
  PetscCall(VecRestoreArray(lmask, &mask));
  PetscCall(VecDestroy(&lmask));
  PetscCall(PetscFree(lrows));

  /* only change matrix nonzero state if pattern was allowed to be changed */
  if (!((Mat_SeqAIJ *)(l->A->data))->keepnonzeropattern) {
    PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
    PetscCall(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1035 
1036 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1037 {
1038   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1039   PetscInt    nt;
1040   VecScatter  Mvctx = a->Mvctx;
1041 
1042   PetscFunctionBegin;
1043   PetscCall(VecGetLocalSize(xx, &nt));
1044   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1045   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1046   PetscUseTypeMethod(a->A, mult, xx, yy);
1047   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1048   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1049   PetscFunctionReturn(PETSC_SUCCESS);
1050 }
1051 
1052 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1053 {
1054   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1055 
1056   PetscFunctionBegin;
1057   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1058   PetscFunctionReturn(PETSC_SUCCESS);
1059 }
1060 
1061 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1062 {
1063   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1064   VecScatter  Mvctx = a->Mvctx;
1065 
1066   PetscFunctionBegin;
1067   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1068   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1069   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1070   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1071   PetscFunctionReturn(PETSC_SUCCESS);
1072 }
1073 
1074 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1075 {
1076   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1077 
1078   PetscFunctionBegin;
1079   /* do nondiagonal part */
1080   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1081   /* do local part */
1082   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1083   /* add partial results together */
1084   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1085   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1086   PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088 
1089 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1090 {
1091   MPI_Comm    comm;
1092   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1093   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1094   IS          Me, Notme;
1095   PetscInt    M, N, first, last, *notme, i;
1096   PetscBool   lf;
1097   PetscMPIInt size;
1098 
1099   PetscFunctionBegin;
1100   /* Easy test: symmetric diagonal block */
1101   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1102   PetscCall(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1103   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1104   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1105   PetscCallMPI(MPI_Comm_size(comm, &size));
1106   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1107 
1108   /* Hard test: off-diagonal block. This takes a MatCreateSubMatrix. */
1109   PetscCall(MatGetSize(Amat, &M, &N));
1110   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1111   PetscCall(PetscMalloc1(N - last + first, &notme));
1112   for (i = 0; i < first; i++) notme[i] = i;
1113   for (i = last; i < M; i++) notme[i - last + first] = i;
1114   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1115   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1116   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1117   Aoff = Aoffs[0];
1118   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1119   Boff = Boffs[0];
1120   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1121   PetscCall(MatDestroyMatrices(1, &Aoffs));
1122   PetscCall(MatDestroyMatrices(1, &Boffs));
1123   PetscCall(ISDestroy(&Me));
1124   PetscCall(ISDestroy(&Notme));
1125   PetscCall(PetscFree(notme));
1126   PetscFunctionReturn(PETSC_SUCCESS);
1127 }
1128 
1129 static PetscErrorCode MatIsSymmetric_MPIAIJ(Mat A, PetscReal tol, PetscBool *f)
1130 {
1131   PetscFunctionBegin;
1132   PetscCall(MatIsTranspose_MPIAIJ(A, A, tol, f));
1133   PetscFunctionReturn(PETSC_SUCCESS);
1134 }
1135 
1136 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1137 {
1138   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1139 
1140   PetscFunctionBegin;
1141   /* do nondiagonal part */
1142   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1143   /* do local part */
1144   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1145   /* add partial results together */
1146   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1147   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1148   PetscFunctionReturn(PETSC_SUCCESS);
1149 }
1150 
1151 /*
1152   This only works correctly for square matrices where the subblock A->A is the
1153    diagonal block
1154 */
1155 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1156 {
1157   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1158 
1159   PetscFunctionBegin;
1160   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1161   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1162   PetscCall(MatGetDiagonal(a->A, v));
1163   PetscFunctionReturn(PETSC_SUCCESS);
1164 }
1165 
1166 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1167 {
1168   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1169 
1170   PetscFunctionBegin;
1171   PetscCall(MatScale(a->A, aa));
1172   PetscCall(MatScale(a->B, aa));
1173   PetscFunctionReturn(PETSC_SUCCESS);
1174 }
1175 
1176 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1177 {
1178   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1179   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1180   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1181   const PetscInt    *garray = aij->garray;
1182   const PetscScalar *aa, *ba;
1183   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1184   PetscInt64         nz, hnz;
1185   PetscInt          *rowlens;
1186   PetscInt          *colidxs;
1187   PetscScalar       *matvals;
1188   PetscMPIInt        rank;
1189 
1190   PetscFunctionBegin;
1191   PetscCall(PetscViewerSetUp(viewer));
1192 
1193   M  = mat->rmap->N;
1194   N  = mat->cmap->N;
1195   m  = mat->rmap->n;
1196   rs = mat->rmap->rstart;
1197   cs = mat->cmap->rstart;
1198   nz = A->nz + B->nz;
1199 
1200   /* write matrix header */
1201   header[0] = MAT_FILE_CLASSID;
1202   header[1] = M;
1203   header[2] = N;
1204   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1205   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1206   if (rank == 0) {
1207     if (hnz > PETSC_MAX_INT) header[3] = PETSC_MAX_INT;
1208     else header[3] = (PetscInt)hnz;
1209   }
1210   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1211 
1212   /* fill in and store row lengths  */
1213   PetscCall(PetscMalloc1(m, &rowlens));
1214   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1215   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1216   PetscCall(PetscFree(rowlens));
1217 
1218   /* fill in and store column indices */
1219   PetscCall(PetscMalloc1(nz, &colidxs));
1220   for (cnt = 0, i = 0; i < m; i++) {
1221     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1222       if (garray[B->j[jb]] > cs) break;
1223       colidxs[cnt++] = garray[B->j[jb]];
1224     }
1225     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1226     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1227   }
1228   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1229   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1230   PetscCall(PetscFree(colidxs));
1231 
1232   /* fill in and store nonzero values */
1233   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1234   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1235   PetscCall(PetscMalloc1(nz, &matvals));
1236   for (cnt = 0, i = 0; i < m; i++) {
1237     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1238       if (garray[B->j[jb]] > cs) break;
1239       matvals[cnt++] = ba[jb];
1240     }
1241     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1242     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1243   }
1244   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1245   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1246   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1247   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1248   PetscCall(PetscFree(matvals));
1249 
1250   /* write block size option to the viewer's .info file */
1251   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1252   PetscFunctionReturn(PETSC_SUCCESS);
1253 }
1254 
1255 #include <petscdraw.h>
static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
{
  Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
  PetscMPIInt       rank = aij->rank, size = aij->size;
  PetscBool         isdraw, iascii, isbinary;
  PetscViewer       sviewer;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
  if (iascii) {
    /* special ASCII formats return early; default ASCII formats fall through to the gather-on-rank-0 path below */
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_LOAD_BALANCE) {
      /* report min/avg/max nonzeros per rank */
      PetscInt i, nmax = 0, nmin = PETSC_MAX_INT, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)(aij->A->data))->nz + ((Mat_SeqAIJ *)(aij->B->data))->nz;
      PetscCall(PetscMalloc1(size, &nz));
      PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
      for (i = 0; i < (PetscInt)size; i++) {
        nmax = PetscMax(nmax, nz[i]);
        nmin = PetscMin(nmin, nz[i]);
        navg += nz[i];
      }
      PetscCall(PetscFree(nz));
      navg = navg / size;
      PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
      PetscFunctionReturn(PETSC_SUCCESS);
    }
    PetscCall(PetscViewerGetFormat(viewer, &format));
    if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
      /* per-rank local row/nonzero/memory statistics, printed synchronized */
      MatInfo   info;
      PetscInt *inodes = NULL;

      PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
      PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
      PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
      PetscCall(PetscViewerASCIIPushSynchronized(viewer));
      if (!inodes) {
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
                                                     (double)info.memory));
      } else {
        PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
                                                     (double)info.memory));
      }
      PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
      PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
      PetscCall(PetscViewerFlush(viewer));
      PetscCall(PetscViewerASCIIPopSynchronized(viewer));
      PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
      PetscCall(VecScatterView(aij->Mvctx, viewer));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_INFO) {
      PetscInt inodecount, inodelimit, *inodes;
      PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
      if (inodes) {
        PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
      } else {
        PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
      }
      PetscFunctionReturn(PETSC_SUCCESS);
    } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
      PetscFunctionReturn(PETSC_SUCCESS);
    }
  } else if (isbinary) {
    if (size == 1) {
      PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
      PetscCall(MatView(aij->A, viewer));
    } else {
      PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (iascii && size == 1) {
    /* NOTE(review): this branch is unreachable — any iascii viewer is already handled by the
       first branch of this if/else chain, so single-rank default ASCII views take the
       gather-on-rank-0 path below instead. Confirm whether that is intended. */
    PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
    PetscCall(MatView(aij->A, viewer));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (isdraw) {
    PetscDraw draw;
    PetscBool isnull;
    PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
    PetscCall(PetscDrawIsNull(draw, &isnull));
    if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
  }

  { /* assemble the entire matrix onto first processor */
    Mat A = NULL, Av;
    IS  isrow, iscol;

    /* rank 0 requests all rows/columns; every other rank requests none */
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
    PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
    PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
    /*  The commented code uses MatCreateSubMatrices instead */
    /*
    Mat *AA, A = NULL, Av;
    IS  isrow,iscol;

    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
    PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
    PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
    if (rank == 0) {
       PetscCall(PetscObjectReference((PetscObject)AA[0]));
       A    = AA[0];
       Av   = AA[0];
    }
    PetscCall(MatDestroySubMatrices(1,&AA));
*/
    PetscCall(ISDestroy(&iscol));
    PetscCall(ISDestroy(&isrow));
    /*
       Everyone has to call to draw the matrix since the graphics waits are
       synchronized across all processors that share the PetscDraw object
    */
    PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    if (rank == 0) {
      if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
      PetscCall(MatView_SeqAIJ(Av, sviewer));
    }
    PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
    PetscCall(PetscViewerFlush(viewer));
    PetscCall(MatDestroy(&A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1381 
1382 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1383 {
1384   PetscBool iascii, isdraw, issocket, isbinary;
1385 
1386   PetscFunctionBegin;
1387   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1388   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1389   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1390   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1391   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1392   PetscFunctionReturn(PETSC_SUCCESS);
1393 }
1394 
/*
   MatSOR_MPIAIJ - SOR/relaxation for MPIAIJ matrices.

   Only the "local" sweep variants (and Eisenstat) are supported: every sweep
   relaxes with the sequential diagonal block mat->A on this process, while the
   off-diagonal block mat->B contributes via the most recently scattered ghost
   values of xx.  A true global parallel SOR is refused with PETSC_ERR_SUP.

   matin  - the MPIAIJ matrix
   bb     - right-hand side
   omega  - relaxation parameter
   flag   - MatSORType bit flags selecting the sweep variant
   fshift - diagonal shift added during relaxation
   its    - number of outer (global) iterations
   lits   - number of local iterations passed to the sequential SOR
   xx     - solution vector (used as initial guess unless SOR_ZERO_INITIAL_GUESS)
*/
static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
  Vec         bb1 = NULL; /* work vector holding bb - B*x; only allocated when needed */
  PetscBool   hasop;

  PetscFunctionBegin;
  /* SOR_APPLY_UPPER acts only on the local diagonal block; delegate and return */
  if (flag == SOR_APPLY_UPPER) {
    PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* bb1 is needed whenever an existing iterate feeds the right-hand side:
     more than one iteration, a nonzero initial guess (~flag & ... tests the bit
     being UNSET), or the Eisenstat trick */
  if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));

  if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
    /* first iteration with a zero guess needs no ghost update: x is zero so B*x = 0 */
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }

    while (its--) {
      /* gather ghost values of the current iterate into mat->lvec */
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
    }
  } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }
    while (its--) {
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
    }
  } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
    if (flag & SOR_ZERO_INITIAL_GUESS) {
      PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
      its--;
    }
    while (its--) {
      PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
      PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));

      /* update rhs: bb1 = bb - B*x */
      PetscCall(VecScale(mat->lvec, -1.0));
      PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));

      /* local sweep */
      PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
    }
  } else if (flag & SOR_EISENSTAT) {
    Vec xx1;

    /* Eisenstat's trick: one backward local sweep with zero guess ... */
    PetscCall(VecDuplicate(bb, &xx1));
    PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));

    PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
    /* lazily build and cache the diagonal of the matrix, used below */
    if (!mat->diag) {
      PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
      PetscCall(MatGetDiagonal(matin, mat->diag));
    }
    /* bb1 = D*x, using a specialized op when the matrix provides one */
    PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
    if (hasop) {
      PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
    } else {
      PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
    }
    /* bb1 = bb + ((omega-2)/omega) * D*x */
    PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));

    /* add off-diagonal contribution from the ghosted iterate */
    PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));

    /* local sweep */
    PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
    PetscCall(VecAXPY(xx, 1.0, xx1));
    PetscCall(VecDestroy(&xx1));
  } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");

  PetscCall(VecDestroy(&bb1));

  /* propagate any factorization error detected in the sequential sweep */
  matin->factorerrortype = mat->A->factorerrortype;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1491 
/*
   MatPermute_MPIAIJ - Creates B = P*A*Q for permutation ISs rowp (P) and colp (Q).

   Strategy: invert the row and column permutations with PetscSF reductions so
   each process learns the destination (global) index of each of its rows and
   columns, broadcast destinations of the ghost columns, precompute diagonal/
   off-diagonal nonzero counts for preallocation, then fill the new matrix with
   MatSetValues row by row.
*/
static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
{
  Mat             aA, aB, Aperm;
  const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
  PetscScalar    *aa, *ba;
  PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
  PetscSF         rowsf, sf;
  IS              parcolp = NULL; /* NOTE(review): never set in this path, so the ISDestroy below is currently dead — verify against callers */
  PetscBool       done;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(A, &m, &n));
  PetscCall(ISGetIndices(rowp, &rwant));
  PetscCall(ISGetIndices(colp, &cwant));
  /* work: scratch sized for both rows and columns; rdest/cdest: destination global index of each local row/column */
  PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));

  /* Invert row permutation to find out where my rows should go */
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
  PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
  PetscCall(PetscSFSetFromOptions(rowsf));
  for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
  PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));

  /* Invert column permutation to find out where my columns should go */
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
  PetscCall(PetscSFSetFromOptions(sf));
  for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));

  PetscCall(ISRestoreIndices(rowp, &rwant));
  PetscCall(ISRestoreIndices(colp, &cwant));
  PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));

  /* Find out where my gcols should go: broadcast the column destinations to the ghost columns */
  PetscCall(MatGetSize(aB, NULL, &ng));
  PetscCall(PetscMalloc1(ng, &gcdest));
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));

  /* dnnz/onnz: per-source-row diagonal/off-diagonal counts; tdnnz/tonnz: the same counts moved to the destination rows */
  PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
  PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
  PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
  for (i = 0; i < m; i++) {
    PetscInt    row = rdest[i];
    PetscMPIInt rowner;
    PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
    for (j = ai[i]; j < ai[i + 1]; j++) {
      PetscInt    col = cdest[aj[j]];
      PetscMPIInt cowner;
      PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
      if (rowner == cowner) dnnz[i]++;
      else onnz[i]++;
    }
    for (j = bi[i]; j < bi[i + 1]; j++) {
      PetscInt    col = gcdest[bj[j]];
      PetscMPIInt cowner;
      PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
      if (rowner == cowner) dnnz[i]++;
      else onnz[i]++;
    }
  }
  /* push the counts from source rows to the processes owning the destination rows */
  PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE))
  PetscCall(PetscSFDestroy(&rowsf));

  PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
  PetscCall(MatSeqAIJGetArray(aA, &aa));
  PetscCall(MatSeqAIJGetArray(aB, &ba));
  for (i = 0; i < m; i++) {
    PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
    PetscInt  j0, rowlen;
    rowlen = ai[i + 1] - ai[i];
    for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than number of rows m, so sum in batches */
      for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
      PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
    }
    rowlen = bi[i + 1] - bi[i];
    for (j0 = j = 0; j < rowlen; j0 = j) {
      for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
      PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
    }
  }
  PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
  PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
  PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
  PetscCall(MatSeqAIJRestoreArray(aA, &aa));
  PetscCall(MatSeqAIJRestoreArray(aB, &ba));
  PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
  PetscCall(PetscFree3(work, rdest, cdest));
  PetscCall(PetscFree(gcdest));
  if (parcolp) PetscCall(ISDestroy(&colp));
  *B = Aperm;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1597 
1598 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1599 {
1600   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1601 
1602   PetscFunctionBegin;
1603   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1604   if (ghosts) *ghosts = aij->garray;
1605   PetscFunctionReturn(PETSC_SUCCESS);
1606 }
1607 
1608 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1609 {
1610   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1611   Mat            A = mat->A, B = mat->B;
1612   PetscLogDouble isend[5], irecv[5];
1613 
1614   PetscFunctionBegin;
1615   info->block_size = 1.0;
1616   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1617 
1618   isend[0] = info->nz_used;
1619   isend[1] = info->nz_allocated;
1620   isend[2] = info->nz_unneeded;
1621   isend[3] = info->memory;
1622   isend[4] = info->mallocs;
1623 
1624   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1625 
1626   isend[0] += info->nz_used;
1627   isend[1] += info->nz_allocated;
1628   isend[2] += info->nz_unneeded;
1629   isend[3] += info->memory;
1630   isend[4] += info->mallocs;
1631   if (flag == MAT_LOCAL) {
1632     info->nz_used      = isend[0];
1633     info->nz_allocated = isend[1];
1634     info->nz_unneeded  = isend[2];
1635     info->memory       = isend[3];
1636     info->mallocs      = isend[4];
1637   } else if (flag == MAT_GLOBAL_MAX) {
1638     PetscCall(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1639 
1640     info->nz_used      = irecv[0];
1641     info->nz_allocated = irecv[1];
1642     info->nz_unneeded  = irecv[2];
1643     info->memory       = irecv[3];
1644     info->mallocs      = irecv[4];
1645   } else if (flag == MAT_GLOBAL_SUM) {
1646     PetscCall(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1647 
1648     info->nz_used      = irecv[0];
1649     info->nz_allocated = irecv[1];
1650     info->nz_unneeded  = irecv[2];
1651     info->memory       = irecv[3];
1652     info->mallocs      = irecv[4];
1653   }
1654   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1655   info->fill_ratio_needed = 0;
1656   info->factor_mallocs    = 0;
1657   PetscFunctionReturn(PETSC_SUCCESS);
1658 }
1659 
/*
   MatSetOption_MPIAIJ - Applies a MatOption to an MPIAIJ matrix, either by
   forwarding it to the sequential diagonal (a->A) and off-diagonal (a->B)
   blocks, by recording it in the parallel data structure, or by ignoring it
   when it does not apply to this format.
*/
PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  switch (op) {
  /* options that only affect the local blocks: forward to both */
  case MAT_NEW_NONZERO_LOCATIONS:
  case MAT_NEW_NONZERO_ALLOCATION_ERR:
  case MAT_UNUSED_NONZERO_LOCATION_ERR:
  case MAT_KEEP_NONZERO_PATTERN:
  case MAT_NEW_NONZERO_LOCATION_ERR:
  case MAT_USE_INODES:
  case MAT_IGNORE_ZERO_ENTRIES:
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    MatCheckPreallocated(A, 1);
    PetscCall(MatSetOption(a->A, op, flg));
    PetscCall(MatSetOption(a->B, op, flg));
    break;
  case MAT_ROW_ORIENTED:
    /* remembered in the parallel struct (used by MatSetValues) and forwarded */
    MatCheckPreallocated(A, 1);
    a->roworiented = flg;

    PetscCall(MatSetOption(a->A, op, flg));
    PetscCall(MatSetOption(a->B, op, flg));
    break;
  case MAT_FORCE_DIAGONAL_ENTRIES:
  case MAT_SORTED_FULL:
    /* not meaningful for this format; note it and carry on */
    PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
    break;
  case MAT_IGNORE_OFF_PROC_ENTRIES:
    /* suppresses stashing of off-process entries during assembly */
    a->donotstash = flg;
    break;
  /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
  case MAT_SPD:
  case MAT_SYMMETRIC:
  case MAT_STRUCTURALLY_SYMMETRIC:
  case MAT_HERMITIAN:
  case MAT_SYMMETRY_ETERNAL:
  case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
  case MAT_SPD_ETERNAL:
    /* if the diagonal matrix is square it inherits some of the properties above */
    break;
  case MAT_SUBMAT_SINGLEIS:
    A->submat_singleis = flg;
    break;
  case MAT_STRUCTURE_ONLY:
    /* The option is handled directly by MatSetOption() */
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1713 
/*
   MatGetRow_MPIAIJ - Returns one (local) row of the matrix with global column
   indices, merging the row of the diagonal block A with the row of the
   off-diagonal block B.

   The merged row is ordered by increasing global column index, assuming A and
   B rows are individually sorted: B entries with global column < cstart come
   first, then all A entries (offset by cstart), then the remaining B entries.
   Results are returned in the per-matrix work arrays rowvalues/rowindices, so
   only one row may be "active" at a time (enforced via getrowactive).
*/
PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
{
  Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
  PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
  PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
  PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
  PetscInt    *cmap, *idx_p;

  PetscFunctionBegin;
  PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
  mat->getrowactive = PETSC_TRUE;

  if (!mat->rowvalues && (idx || v)) {
    /*
        allocate enough space to hold information from the longest row.
    */
    Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
    PetscInt    max = 1, tmp;
    for (i = 0; i < matin->rmap->n; i++) {
      tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
      if (max < tmp) max = tmp;
    }
    PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
  }

  PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
  lrow = row - rstart;

  /* pass NULL to the sequential getrow for outputs the caller did not request */
  pvA = &vworkA;
  pcA = &cworkA;
  pvB = &vworkB;
  pcB = &cworkB;
  if (!v) {
    pvA = NULL;
    pvB = NULL;
  }
  if (!idx) {
    pcA = NULL;
    if (!v) pcB = NULL;
  }
  PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
  PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
  nztot = nzA + nzB;

  cmap = mat->garray; /* maps compressed B columns to global column indices */
  if (v || idx) {
    if (nztot) {
      /* Sort by increasing column numbers, assuming A and B already sorted */
      PetscInt imark = -1; /* number of B entries whose global column precedes the diagonal block */
      if (v) {
        *v = v_p = mat->rowvalues;
        for (i = 0; i < nzB; i++) {
          if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
          else break;
        }
        imark = i;
        for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
        for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
      }
      if (idx) {
        *idx = idx_p = mat->rowindices;
        if (imark > -1) {
          /* the values pass already found the split point; reuse it */
          for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
        } else {
          for (i = 0; i < nzB; i++) {
            if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
            else break;
          }
          imark = i;
        }
        for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
        for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
      }
    } else {
      if (idx) *idx = NULL;
      if (v) *v = NULL;
    }
  }
  *nz = nztot;
  PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
  PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1797 
1798 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1799 {
1800   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1801 
1802   PetscFunctionBegin;
1803   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1804   aij->getrowactive = PETSC_FALSE;
1805   PetscFunctionReturn(PETSC_SUCCESS);
1806 }
1807 
/*
   MatNorm_MPIAIJ - Computes the Frobenius, 1- (max column sum), or infinity
   (max row sum) norm of an MPIAIJ matrix by combining the local diagonal (A)
   and off-diagonal (B) blocks and reducing across the communicator.
   The 2-norm is not supported.
*/
static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
{
  Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
  Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
  PetscInt         i, j, cstart = mat->cmap->rstart;
  PetscReal        sum = 0.0;
  const MatScalar *v, *amata, *bmata;

  PetscFunctionBegin;
  if (aij->size == 1) {
    /* uniprocessor: the diagonal block is the whole matrix */
    PetscCall(MatNorm(aij->A, type, norm));
  } else {
    PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
    PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
    if (type == NORM_FROBENIUS) {
      /* sum |a_ij|^2 over both blocks, reduce, take the square root */
      v = amata;
      for (i = 0; i < amat->nz; i++) {
        sum += PetscRealPart(PetscConj(*v) * (*v));
        v++;
      }
      v = bmata;
      for (i = 0; i < bmat->nz; i++) {
        sum += PetscRealPart(PetscConj(*v) * (*v));
        v++;
      }
      PetscCall(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
      *norm = PetscSqrtReal(*norm);
      PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
    } else if (type == NORM_1) { /* max column norm */
      /* accumulate |a_ij| per GLOBAL column (tmp has N entries), reduce, take the max */
      PetscReal *tmp, *tmp2;
      PetscInt  *jj, *garray = aij->garray;
      PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
      PetscCall(PetscMalloc1(mat->cmap->N + 1, &tmp2));
      *norm = 0.0;
      v     = amata;
      jj    = amat->j;
      for (j = 0; j < amat->nz; j++) {
        /* diagonal-block columns are local; shift by cstart to global */
        tmp[cstart + *jj++] += PetscAbsScalar(*v);
        v++;
      }
      v  = bmata;
      jj = bmat->j;
      for (j = 0; j < bmat->nz; j++) {
        /* off-diagonal-block columns are compressed; garray maps them to global */
        tmp[garray[*jj++]] += PetscAbsScalar(*v);
        v++;
      }
      PetscCall(MPIU_Allreduce(tmp, tmp2, mat->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
      for (j = 0; j < mat->cmap->N; j++) {
        if (tmp2[j] > *norm) *norm = tmp2[j];
      }
      PetscCall(PetscFree(tmp));
      PetscCall(PetscFree(tmp2));
      PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
    } else if (type == NORM_INFINITY) { /* max row norm */
      /* rows are not split across processes, so a local row-sum max plus one MPI_MAX suffices */
      PetscReal ntemp = 0.0;
      for (j = 0; j < aij->A->rmap->n; j++) {
        v   = amata + amat->i[j];
        sum = 0.0;
        for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
          sum += PetscAbsScalar(*v);
          v++;
        }
        v = bmata + bmat->i[j];
        for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
          sum += PetscAbsScalar(*v);
          v++;
        }
        if (sum > ntemp) ntemp = sum;
      }
      PetscCall(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
      PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
    } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
    PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
    PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1885 
/*
   MatTranspose_MPIAIJ - Forms the transpose of an MPIAIJ matrix.

   The diagonal block is transposed locally (all writes stay on-process), while
   the off-diagonal block is moved with MatSetValues(): each local B entry
   (row, garray[col]) becomes entry (garray[col], row) of the result, which is
   generally owned by another process.  Preallocation counts are computed with
   a PetscSF reduction over the column layout.

   reuse may be MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX (*matout has the transpose
   nonzero pattern), or MAT_INPLACE_MATRIX (*matout == A; headers are merged at
   the end).
*/
static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
{
  Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
  Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
  PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
  const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
  Mat              B, A_diag, *B_diag;
  const MatScalar *pbv, *bv;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
  ma = A->rmap->n;
  na = A->cmap->n;
  mb = a->B->rmap->n;
  nb = a->B->cmap->n;
  ai = Aloc->i;
  aj = Aloc->j;
  bi = Bloc->i;
  bj = Bloc->j;
  if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
    /* create the result matrix with exact preallocation */
    PetscInt            *d_nnz, *g_nnz, *o_nnz;
    PetscSFNode         *oloc;
    PETSC_UNUSED PetscSF sf;

    PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
    /* compute d_nnz for preallocation */
    PetscCall(PetscArrayzero(d_nnz, na));
    for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
    /* compute local off-diagonal contributions */
    PetscCall(PetscArrayzero(g_nnz, nb));
    for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
    /* map those to global */
    PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
    PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
    PetscCall(PetscSFSetFromOptions(sf));
    PetscCall(PetscArrayzero(o_nnz, na));
    PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
    PetscCall(PetscSFDestroy(&sf));

    /* B has the transposed layout: local sizes and block sizes swap */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
    PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
    PetscCall(MatSetBlockSizes(B, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
    PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
    PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
  } else {
    B = *matout;
    /* reusing a matrix with the transpose pattern: any new allocation is a bug */
    PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  }

  b           = (Mat_MPIAIJ *)B->data;
  A_diag      = a->A;
  B_diag      = &b->A;
  sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
  A_diag_ncol = A_diag->cmap->N;
  B_diag_ilen = sub_B_diag->ilen;
  B_diag_i    = sub_B_diag->i;

  /* Set ilen for diagonal of B */
  for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];

  /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
  very quickly (=without using MatSetValues), because all writes are local. */
  PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
  PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));

  /* copy over the B part */
  PetscCall(PetscMalloc1(bi[mb], &cols));
  PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
  pbv = bv;
  row = A->rmap->rstart;
  /* translate compressed B columns to global indices once, up front */
  for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
  cols_tmp = cols;
  for (i = 0; i < mb; i++) {
    ncol = bi[i + 1] - bi[i];
    /* inserting row i of B as COLUMN (global row) entries of the transpose */
    PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
    row++;
    if (pbv) pbv += ncol;
    if (cols_tmp) cols_tmp += ncol;
  }
  PetscCall(PetscFree(cols));
  PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));

  PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
    *matout = B;
  } else {
    /* in-place transpose: replace A's guts with B's and destroy the shell */
    PetscCall(MatHeaderMerge(A, &B));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1979 
1980 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1981 {
1982   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1983   Mat         a = aij->A, b = aij->B;
1984   PetscInt    s1, s2, s3;
1985 
1986   PetscFunctionBegin;
1987   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1988   if (rr) {
1989     PetscCall(VecGetLocalSize(rr, &s1));
1990     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1991     /* Overlap communication with computation. */
1992     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1993   }
1994   if (ll) {
1995     PetscCall(VecGetLocalSize(ll, &s1));
1996     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1997     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1998   }
1999   /* scale  the diagonal block */
2000   PetscUseTypeMethod(a, diagonalscale, ll, rr);
2001 
2002   if (rr) {
2003     /* Do a scatter end and then right scale the off-diagonal block */
2004     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2005     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
2006   }
2007   PetscFunctionReturn(PETSC_SUCCESS);
2008 }
2009 
2010 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2011 {
2012   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2013 
2014   PetscFunctionBegin;
2015   PetscCall(MatSetUnfactored(a->A));
2016   PetscFunctionReturn(PETSC_SUCCESS);
2017 }
2018 
2019 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2020 {
2021   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2022   Mat         a, b, c, d;
2023   PetscBool   flg;
2024 
2025   PetscFunctionBegin;
2026   a = matA->A;
2027   b = matA->B;
2028   c = matB->A;
2029   d = matB->B;
2030 
2031   PetscCall(MatEqual(a, c, &flg));
2032   if (flg) PetscCall(MatEqual(b, d, &flg));
2033   PetscCall(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2034   PetscFunctionReturn(PETSC_SUCCESS);
2035 }
2036 
2037 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2038 {
2039   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2040   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2041 
2042   PetscFunctionBegin;
2043   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2044   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2045     /* because of the column compression in the off-processor part of the matrix a->B,
2046        the number of columns in a->B and b->B may be different, hence we cannot call
2047        the MatCopy() directly on the two parts. If need be, we can provide a more
2048        efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
2049        then copying the submatrices */
2050     PetscCall(MatCopy_Basic(A, B, str));
2051   } else {
2052     PetscCall(MatCopy(a->A, b->A, str));
2053     PetscCall(MatCopy(a->B, b->B, str));
2054   }
2055   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2056   PetscFunctionReturn(PETSC_SUCCESS);
2057 }
2058 
2059 /*
2060    Computes the number of nonzeros per row needed for preallocation when X and Y
2061    have different nonzero structure.
2062 */
2063 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2064 {
2065   PetscInt i, j, k, nzx, nzy;
2066 
2067   PetscFunctionBegin;
2068   /* Set the number of nonzeros in the new matrix */
2069   for (i = 0; i < m; i++) {
2070     const PetscInt *xjj = xj + xi[i], *yjj = yj + yi[i];
2071     nzx    = xi[i + 1] - xi[i];
2072     nzy    = yi[i + 1] - yi[i];
2073     nnz[i] = 0;
2074     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2075       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2076       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2077       nnz[i]++;
2078     }
2079     for (; k < nzy; k++) nnz[i]++;
2080   }
2081   PetscFunctionReturn(PETSC_SUCCESS);
2082 }
2083 
2084 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2085 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2086 {
2087   PetscInt    m = Y->rmap->N;
2088   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2089   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2090 
2091   PetscFunctionBegin;
2092   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2093   PetscFunctionReturn(PETSC_SUCCESS);
2094 }
2095 
/*
   MatAXPY_MPIAIJ - Computes Y = a*X + Y.

   SAME_NONZERO_PATTERN delegates to the two sequential blocks directly;
   SUBSET_NONZERO_PATTERN uses the generic in-place kernel; any other pattern
   builds a fresh matrix with the merged nonzero structure and swaps it into Y
   via MatHeaderMerge().
*/
static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;

  PetscFunctionBegin;
  if (str == SAME_NONZERO_PATTERN) {
    PetscCall(MatAXPY(yy->A, a, xx->A, str));
    PetscCall(MatAXPY(yy->B, a, xx->B, str));
  } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
    PetscCall(MatAXPY_Basic(Y, a, X, str));
  } else {
    Mat       B;
    PetscInt *nnz_d, *nnz_o;

    /* preallocate B with the union of the two nonzero patterns, per block
       (yy->A/B are sequential, so rmap->N is their local row count) */
    PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
    PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
    PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
    PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
    PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
    PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
    PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
    PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
    PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
    PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
    /* replace Y's innards with B's; B's shell is destroyed */
    PetscCall(MatHeaderMerge(Y, &B));
    PetscCall(PetscFree(nnz_d));
    PetscCall(PetscFree(nnz_o));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2126 
2127 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2128 
2129 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2130 {
2131   PetscFunctionBegin;
2132   if (PetscDefined(USE_COMPLEX)) {
2133     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2134 
2135     PetscCall(MatConjugate_SeqAIJ(aij->A));
2136     PetscCall(MatConjugate_SeqAIJ(aij->B));
2137   }
2138   PetscFunctionReturn(PETSC_SUCCESS);
2139 }
2140 
2141 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2142 {
2143   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2144 
2145   PetscFunctionBegin;
2146   PetscCall(MatRealPart(a->A));
2147   PetscCall(MatRealPart(a->B));
2148   PetscFunctionReturn(PETSC_SUCCESS);
2149 }
2150 
2151 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2152 {
2153   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2154 
2155   PetscFunctionBegin;
2156   PetscCall(MatImaginaryPart(a->A));
2157   PetscCall(MatImaginaryPart(a->B));
2158   PetscFunctionReturn(PETSC_SUCCESS);
2159 }
2160 
2161 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2162 {
2163   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2164   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2165   PetscScalar       *va, *vv;
2166   Vec                vB, vA;
2167   const PetscScalar *vb;
2168 
2169   PetscFunctionBegin;
2170   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &vA));
2171   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2172 
2173   PetscCall(VecGetArrayWrite(vA, &va));
2174   if (idx) {
2175     for (i = 0; i < m; i++) {
2176       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2177     }
2178   }
2179 
2180   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &vB));
2181   PetscCall(PetscMalloc1(m, &idxb));
2182   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2183 
2184   PetscCall(VecGetArrayWrite(v, &vv));
2185   PetscCall(VecGetArrayRead(vB, &vb));
2186   for (i = 0; i < m; i++) {
2187     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2188       vv[i] = vb[i];
2189       if (idx) idx[i] = a->garray[idxb[i]];
2190     } else {
2191       vv[i] = va[i];
2192       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2193     }
2194   }
2195   PetscCall(VecRestoreArrayWrite(vA, &vv));
2196   PetscCall(VecRestoreArrayWrite(vA, &va));
2197   PetscCall(VecRestoreArrayRead(vB, &vb));
2198   PetscCall(PetscFree(idxb));
2199   PetscCall(VecDestroy(&vA));
2200   PetscCall(VecDestroy(&vB));
2201   PetscFunctionReturn(PETSC_SUCCESS);
2202 }
2203 
/*
  For each local row, stores in v the entry of minimum absolute value and (optionally) in
  idx[] its global column. The implicit zeros of the off-diagonal block B (columns of A
  that B stores no entry for) participate: an implicit 0.0 has the smallest possible
  absolute value, so finding the global column of the first such "hole" matters.
*/
static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray; /* local-to-global map for B's compressed columns */
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* Everything is in the diagonal block; delegate directly, writing into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    /* This process owns no columns: every row is all implicit zeros */
    if (m) {
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = 0.0;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav; /* ba/bj advance element-by-element across rows in the loop below */
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense: no implicit zeros; start from the first stored entry */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse, so it contains an implicit 0.0 whose absolute value cannot be beaten */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        /* NOTE(review): comparing the global column `col` against the loop counter `j`
           (and `j` against cstart) looks suspicious — verify against the intended
           hole-detection logic for columns before/after the owned range */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          /* Row of B is empty: first hole is global column 0, or just past the owned range */
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          /* NOTE(review): cmap is indexed by B's column number elsewhere; cmap[ncols - 1]
             indexes it by the entry count — confirm this should not be cmap[bj[ncols - 1]] */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n; /* skip over the owned (diagonal) column range */
        }
      }
    }

    /* Scan the stored entries of row r; keep the one of smallest absolute value */
    for (j = 0; j < ncols; j++) {
      if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* Merge the diagonal-block and off-diagonal-block candidates; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r]; /* diag indices are local; shift to global */
    } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2319 
/*
  For each local row, stores in v the minimum entry (compared by real part, which matters
  for complex builds) and (optionally) in idx[] its global column. Implicit zeros of the
  off-diagonal block B participate in the minimum, so the global column of the first
  "hole" in B's compressed columns is computed as a candidate.
*/
static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray; /* local-to-global map for B's compressed columns */
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* Everything is in the diagonal block; delegate directly, writing into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMin(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    /* This process owns no columns: report an empty-row sentinel */
    if (m) {
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = PETSC_MAX_REAL;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav; /* ba/bj advance element-by-element across rows in the loop below */
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense: no implicit zeros; start from the first stored entry */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse, so an implicit 0.0 exists and the row minimum is 0.0 or lower */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        /* NOTE(review): comparing the global column `col` against the loop counter `j`
           (and `j` against cstart) looks suspicious — verify the hole-detection logic */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          /* Row of B is empty: first hole is global column 0, or just past the owned range */
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          /* NOTE(review): cmap is indexed by B's column number elsewhere; cmap[ncols - 1]
             indexes it by the entry count — confirm this should not be cmap[bj[ncols - 1]] */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n; /* skip over the owned (diagonal) column range */
        }
      }
    }

    /* Scan the stored entries of row r; keep the smallest (by real part) */
    for (j = 0; j < ncols; j++) {
      if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* Merge the diagonal-block and off-diagonal-block candidates; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r]; /* diag indices are local; shift to global */
    } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2435 
/*
  For each local row, stores in v the maximum entry (compared by real part, which matters
  for complex builds) and (optionally) in idx[] its global column. Implicit zeros of the
  off-diagonal block B participate in the maximum, so the global column of the first
  "hole" in B's compressed columns is computed as a candidate.
*/
static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
{
  Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
  PetscInt           m = A->rmap->n, n = A->cmap->n;
  PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
  PetscInt          *cmap = mat->garray; /* local-to-global map for B's compressed columns */
  PetscInt          *diagIdx, *offdiagIdx;
  Vec                diagV, offdiagV;
  PetscScalar       *a, *diagA, *offdiagA;
  const PetscScalar *ba, *bav;
  PetscInt           r, j, col, ncols, *bi, *bj;
  Mat                B = mat->B;
  Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;

  PetscFunctionBegin;
  /* When a process holds entire A and other processes have no entry */
  if (A->cmap->N == n) {
    /* Everything is in the diagonal block; delegate directly, writing into v's array */
    PetscCall(VecGetArrayWrite(v, &diagA));
    PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
    PetscCall(MatGetRowMax(mat->A, diagV, idx));
    PetscCall(VecDestroy(&diagV));
    PetscCall(VecRestoreArrayWrite(v, &diagA));
    PetscFunctionReturn(PETSC_SUCCESS);
  } else if (n == 0) {
    /* This process owns no columns: report an empty-row sentinel */
    if (m) {
      PetscCall(VecGetArrayWrite(v, &a));
      for (r = 0; r < m; r++) {
        a[r] = PETSC_MIN_REAL;
        if (idx) idx[r] = -1;
      }
      PetscCall(VecRestoreArrayWrite(v, &a));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
  PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
  PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));

  /* Get offdiagIdx[] for implicit 0.0 */
  PetscCall(MatSeqAIJGetArrayRead(B, &bav));
  ba = bav; /* ba/bj advance element-by-element across rows in the loop below */
  bi = b->i;
  bj = b->j;
  PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
  for (r = 0; r < m; r++) {
    ncols = bi[r + 1] - bi[r];
    if (ncols == A->cmap->N - n) { /* Brow is dense: no implicit zeros; start from the first stored entry */
      offdiagA[r]   = *ba;
      offdiagIdx[r] = cmap[0];
    } else { /* Brow is sparse so already KNOW maximum is 0.0 or higher */
      offdiagA[r] = 0.0;

      /* Find first hole in the cmap */
      for (j = 0; j < ncols; j++) {
        col = cmap[bj[j]]; /* global column number = cmap[B column number] */
        /* NOTE(review): comparing the global column `col` against the loop counter `j`
           (and `j` against cstart) looks suspicious — verify the hole-detection logic */
        if (col > j && j < cstart) {
          offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
          break;
        } else if (col > j + n && j >= cstart) {
          offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
          break;
        }
      }
      if (j == ncols && ncols < A->cmap->N - n) {
        /* a hole is outside compressed Bcols */
        if (ncols == 0) {
          /* Row of B is empty: first hole is global column 0, or just past the owned range */
          if (cstart) {
            offdiagIdx[r] = 0;
          } else offdiagIdx[r] = cend;
        } else { /* ncols > 0 */
          /* NOTE(review): cmap is indexed by B's column number elsewhere; cmap[ncols - 1]
             indexes it by the entry count — confirm this should not be cmap[bj[ncols - 1]] */
          offdiagIdx[r] = cmap[ncols - 1] + 1;
          if (offdiagIdx[r] == cstart) offdiagIdx[r] += n; /* skip over the owned (diagonal) column range */
        }
      }
    }

    /* Scan the stored entries of row r; keep the largest (by real part) */
    for (j = 0; j < ncols; j++) {
      if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
        offdiagA[r]   = *ba;
        offdiagIdx[r] = cmap[*bj];
      }
      ba++;
      bj++;
    }
  }

  /* Merge the diagonal-block and off-diagonal-block candidates; ties go to the smaller global column */
  PetscCall(VecGetArrayWrite(v, &a));
  PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
  for (r = 0; r < m; ++r) {
    if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) idx[r] = cstart + diagIdx[r]; /* diag indices are local; shift to global */
    } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
      a[r] = diagA[r];
      if (idx) {
        if (cstart + diagIdx[r] <= offdiagIdx[r]) {
          idx[r] = cstart + diagIdx[r];
        } else idx[r] = offdiagIdx[r];
      }
    } else {
      a[r] = offdiagA[r];
      if (idx) idx[r] = offdiagIdx[r];
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
  PetscCall(VecRestoreArrayWrite(v, &a));
  PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
  PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
  PetscCall(VecDestroy(&diagV));
  PetscCall(VecDestroy(&offdiagV));
  PetscCall(PetscFree2(diagIdx, offdiagIdx));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2551 
2552 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2553 {
2554   Mat *dummy;
2555 
2556   PetscFunctionBegin;
2557   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2558   *newmat = *dummy;
2559   PetscCall(PetscFree(dummy));
2560   PetscFunctionReturn(PETSC_SUCCESS);
2561 }
2562 
2563 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2564 {
2565   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2566 
2567   PetscFunctionBegin;
2568   PetscCall(MatInvertBlockDiagonal(a->A, values));
2569   A->factorerrortype = a->A->factorerrortype;
2570   PetscFunctionReturn(PETSC_SUCCESS);
2571 }
2572 
2573 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2574 {
2575   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2576 
2577   PetscFunctionBegin;
2578   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2579   PetscCall(MatSetRandom(aij->A, rctx));
2580   if (x->assembled) {
2581     PetscCall(MatSetRandom(aij->B, rctx));
2582   } else {
2583     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2584   }
2585   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2586   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2587   PetscFunctionReturn(PETSC_SUCCESS);
2588 }
2589 
2590 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2591 {
2592   PetscFunctionBegin;
2593   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2594   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2595   PetscFunctionReturn(PETSC_SUCCESS);
2596 }
2597 
2598 /*@
2599   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2600 
2601   Not Collective
2602 
2603   Input Parameter:
2604 . A - the matrix
2605 
2606   Output Parameter:
2607 . nz - the number of nonzeros
2608 
2609   Level: advanced
2610 
2611 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2612 @*/
2613 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2614 {
2615   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2616   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2617   PetscBool   isaij;
2618 
2619   PetscFunctionBegin;
2620   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2621   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2622   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2623   PetscFunctionReturn(PETSC_SUCCESS);
2624 }
2625 
/*@
  MatMPIAIJSetUseScalableIncreaseOverlap - Determine if the matrix uses a scalable algorithm to compute the overlap

  Collective

  Input Parameters:
+ A  - the matrix
- sc - `PETSC_TRUE` indicates use the scalable algorithm (default is not to use the scalable algorithm)

  Level: advanced

.seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
@*/
PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation if one is registered; silently a no-op otherwise */
  PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2645 
2646 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems *PetscOptionsObject)
2647 {
2648   PetscBool sc = PETSC_FALSE, flg;
2649 
2650   PetscFunctionBegin;
2651   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2652   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2653   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2654   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2655   PetscOptionsHeadEnd();
2656   PetscFunctionReturn(PETSC_SUCCESS);
2657 }
2658 
/* Y = Y + a*I, ensuring diagonal slots exist in the (possibly unpreallocated) matrix first */
static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
{
  Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
  Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;

  PetscFunctionBegin;
  if (!Y->preallocated) {
    /* Never preallocated: one diagonal entry per row is enough for the shift */
    PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
  } else if (!aij->nz) { /* It does not matter if diagonals of Y only partially lie in maij->A. We just need an estimated preallocation. */
    /* Save/restore nonew around the preallocation call so the user's "no new nonzeros"
       setting survives (the call would otherwise reset it — that is why it is saved) */
    PetscInt nonew = aij->nonew;
    PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
    aij->nonew = nonew;
  }
  PetscCall(MatShift_Basic(Y, a));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2675 
2676 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2677 {
2678   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2679 
2680   PetscFunctionBegin;
2681   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2682   PetscCall(MatMissingDiagonal(a->A, missing, d));
2683   if (d) {
2684     PetscInt rstart;
2685     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2686     *d += rstart;
2687   }
2688   PetscFunctionReturn(PETSC_SUCCESS);
2689 }
2690 
2691 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2692 {
2693   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2694 
2695   PetscFunctionBegin;
2696   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2697   PetscFunctionReturn(PETSC_SUCCESS);
2698 }
2699 
2700 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2701 {
2702   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2703 
2704   PetscFunctionBegin;
2705   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2706   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2707   PetscFunctionReturn(PETSC_SUCCESS);
2708 }
2709 
/* Function table ("virtual methods") for MATMPIAIJ; the numbered comments index slots
   of struct _MatOps, and NULL marks operations this type does not implement directly */
static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
                                       MatGetRow_MPIAIJ,
                                       MatRestoreRow_MPIAIJ,
                                       MatMult_MPIAIJ,
                                       /* 4*/ MatMultAdd_MPIAIJ,
                                       MatMultTranspose_MPIAIJ,
                                       MatMultTransposeAdd_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_MPIAIJ,
                                       MatTranspose_MPIAIJ,
                                       /*15*/ MatGetInfo_MPIAIJ,
                                       MatEqual_MPIAIJ,
                                       MatGetDiagonal_MPIAIJ,
                                       MatDiagonalScale_MPIAIJ,
                                       MatNorm_MPIAIJ,
                                       /*20*/ MatAssemblyBegin_MPIAIJ,
                                       MatAssemblyEnd_MPIAIJ,
                                       MatSetOption_MPIAIJ,
                                       MatZeroEntries_MPIAIJ,
                                       /*24*/ MatZeroRows_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*29*/ MatSetUp_MPI_Hash,
                                       NULL,
                                       NULL,
                                       MatGetDiagonalBlock_MPIAIJ,
                                       NULL,
                                       /*34*/ MatDuplicate_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*39*/ MatAXPY_MPIAIJ,
                                       MatCreateSubMatrices_MPIAIJ,
                                       MatIncreaseOverlap_MPIAIJ,
                                       MatGetValues_MPIAIJ,
                                       MatCopy_MPIAIJ,
                                       /*44*/ MatGetRowMax_MPIAIJ,
                                       MatScale_MPIAIJ,
                                       MatShift_MPIAIJ,
                                       MatDiagonalSet_MPIAIJ,
                                       MatZeroRowsColumns_MPIAIJ,
                                       /*49*/ MatSetRandom_MPIAIJ,
                                       MatGetRowIJ_MPIAIJ,
                                       MatRestoreRowIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*54*/ MatFDColoringCreate_MPIXAIJ,
                                       NULL,
                                       MatSetUnfactored_MPIAIJ,
                                       MatPermute_MPIAIJ,
                                       NULL,
                                       /*59*/ MatCreateSubMatrix_MPIAIJ,
                                       MatDestroy_MPIAIJ,
                                       MatView_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*64*/ NULL,
                                       MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*69*/ MatGetRowMaxAbs_MPIAIJ,
                                       MatGetRowMinAbs_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*75*/ MatFDColoringApply_AIJ,
                                       MatSetFromOptions_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatFindZeroDiagonals_MPIAIJ,
                                       /*80*/ NULL,
                                       NULL,
                                       NULL,
                                       /*83*/ MatLoad_MPIAIJ,
                                       MatIsSymmetric_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*89*/ NULL,
                                       NULL,
                                       MatMatMultNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*94*/ MatPtAPNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatBindToCPU_MPIAIJ,
                                       /*99*/ MatProductSetFromOptions_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatConjugate_MPIAIJ,
                                       NULL,
                                       /*104*/ MatSetValuesRow_MPIAIJ,
                                       MatRealPart_MPIAIJ,
                                       MatImaginaryPart_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       MatGetRowMin_MPIAIJ,
                                       NULL,
                                       MatMissingDiagonal_MPIAIJ,
                                       /*114*/ MatGetSeqNonzeroStructure_MPIAIJ,
                                       NULL,
                                       MatGetGhosts_MPIAIJ,
                                       NULL,
                                       NULL,
                                       /*119*/ MatMultDiagonalBlock_MPIAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatGetMultiProcBlock_MPIAIJ,
                                       /*124*/ MatFindNonzeroRows_MPIAIJ,
                                       MatGetColumnReductions_MPIAIJ,
                                       MatInvertBlockDiagonal_MPIAIJ,
                                       MatInvertVariableBlockDiagonal_MPIAIJ,
                                       MatCreateSubMatricesMPI_MPIAIJ,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ MatSetBlockSizes_MPIAIJ,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_MPIXAIJ,
                                       MatFindOffBlockDiagonalEntries_MPIAIJ,
                                       MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
                                       /*145*/ NULL,
                                       NULL,
                                       NULL,
                                       MatCreateGraph_Simple_AIJ,
                                       NULL,
                                       /*150*/ NULL,
                                       MatEliminateZeros_MPIAIJ};
2862 
2863 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2864 {
2865   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2866 
2867   PetscFunctionBegin;
2868   PetscCall(MatStoreValues(aij->A));
2869   PetscCall(MatStoreValues(aij->B));
2870   PetscFunctionReturn(PETSC_SUCCESS);
2871 }
2872 
2873 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2874 {
2875   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2876 
2877   PetscFunctionBegin;
2878   PetscCall(MatRetrieveValues(aij->A));
2879   PetscCall(MatRetrieveValues(aij->B));
2880   PetscFunctionReturn(PETSC_SUCCESS);
2881 }
2882 
/* Preallocate the MPIAIJ matrix B: (re)creates the sequential diagonal block b->A
   (d_nz/d_nnz) and off-diagonal block b->B (o_nz/o_nnz), discarding any structures
   built by a previous preallocation or assembly. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
  PetscMPIInt size;

  PetscFunctionBegin;
  /* leave hash-table insertion mode: restore the ops table that was saved in b->cops */
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));

  /* discard the column map, ghost-column array, local work vector and scatter
     built for the previous nonzero pattern */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));

  /* (re)create the off-diagonal block; on a single process it has zero columns
     since there are no off-process entries */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
  PetscCall(MatDestroy(&b->B));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
  PetscCall(MatSetType(b->B, MATSEQAIJ));

  /* (re)create the diagonal block with the local row/column sizes of B */
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
  PetscCall(MatSetType(b->A, MATSEQAIJ));

  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  /* matrix is now preallocated but must be (re)assembled before use */
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2925 
2926 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2927 {
2928   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2929 
2930   PetscFunctionBegin;
2931   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2932   PetscCall(PetscLayoutSetUp(B->rmap));
2933   PetscCall(PetscLayoutSetUp(B->cmap));
2934 
2935 #if defined(PETSC_USE_CTABLE)
2936   PetscCall(PetscHMapIDestroy(&b->colmap));
2937 #else
2938   PetscCall(PetscFree(b->colmap));
2939 #endif
2940   PetscCall(PetscFree(b->garray));
2941   PetscCall(VecDestroy(&b->lvec));
2942   PetscCall(VecScatterDestroy(&b->Mvctx));
2943 
2944   PetscCall(MatResetPreallocation(b->A));
2945   PetscCall(MatResetPreallocation(b->B));
2946   B->preallocated  = PETSC_TRUE;
2947   B->was_assembled = PETSC_FALSE;
2948   B->assembled     = PETSC_FALSE;
2949   PetscFunctionReturn(PETSC_SUCCESS);
2950 }
2951 
/* Duplicate an MPIAIJ matrix: copies layout, flags and communication structures,
   and duplicates the diagonal/off-diagonal sequential blocks (values copied or
   not according to cpvalues). */
PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
{
  Mat         mat;
  Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;

  PetscFunctionBegin;
  *newmat = NULL;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
  PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
  PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
  PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
  a = (Mat_MPIAIJ *)mat->data;

  mat->factortype = matin->factortype;
  mat->assembled  = matin->assembled;
  mat->insertmode = NOT_SET_VALUES;

  a->size         = oldmat->size;
  a->rank         = oldmat->rank;
  a->donotstash   = oldmat->donotstash;
  a->roworiented  = oldmat->roworiented;
  /* per-matrix MatGetRow() work space is not copied; it is rebuilt on demand */
  a->rowindices   = NULL;
  a->rowvalues    = NULL;
  a->getrowactive = PETSC_FALSE;

  PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
  PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
  if (matin->hash_active) {
    /* source is still in hash-table insertion mode; just set up the copy the same way */
    PetscCall(MatSetUp(mat));
  } else {
    mat->preallocated = matin->preallocated;
    /* deep-copy the global-to-local column map (hash table or flat array build) */
    if (oldmat->colmap) {
#if defined(PETSC_USE_CTABLE)
      PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
#else
      PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
      PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
#endif
    } else a->colmap = NULL;
    /* deep-copy the ghost (off-diagonal) column index array */
    if (oldmat->garray) {
      PetscInt len;
      len = oldmat->B->cmap->n;
      PetscCall(PetscMalloc1(len + 1, &a->garray));
      if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
    } else a->garray = NULL;

    /* It may happen MatDuplicate is called with a non-assembled matrix
      In fact, MatDuplicate only requires the matrix to be preallocated
      This may happen inside a DMCreateMatrix_Shell */
    if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
    if (oldmat->Mvctx) PetscCall(VecScatterCopy(oldmat->Mvctx, &a->Mvctx));
    PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
    PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
  }
  /* carry over composed functions (e.g. type-specific operations) */
  PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
  *newmat = mat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3010 
3011 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3012 {
3013   PetscBool isbinary, ishdf5;
3014 
3015   PetscFunctionBegin;
3016   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3017   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3018   /* force binary viewer to load .info file if it has not yet done so */
3019   PetscCall(PetscViewerSetUp(viewer));
3020   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3021   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3022   if (isbinary) {
3023     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3024   } else if (ishdf5) {
3025 #if defined(PETSC_HAVE_HDF5)
3026     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3027 #else
3028     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3029 #endif
3030   } else {
3031     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3032   }
3033   PetscFunctionReturn(PETSC_SUCCESS);
3034 }
3035 
/* Read an MPIAIJ matrix from a PETSc binary viewer: header, per-row lengths,
   column indices, and values, then preallocate/fill via CSR. Collective. */
PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
{
  PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
  PetscInt    *rowidxs, *colidxs;
  PetscScalar *matvals;

  PetscFunctionBegin;
  PetscCall(PetscViewerSetUp(viewer));

  /* read in matrix header: classid, global rows, global columns, total nonzeros */
  PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
  PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
  M  = header[1];
  N  = header[2];
  nz = header[3];
  PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
  PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
  /* a negative nonzero count marks an on-disk format this reader cannot handle */
  PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");

  /* set block sizes from the viewer's .info file */
  PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
  /* set global sizes if not set already */
  if (mat->rmap->N < 0) mat->rmap->N = M;
  if (mat->cmap->N < 0) mat->cmap->N = N;
  PetscCall(PetscLayoutSetUp(mat->rmap));
  PetscCall(PetscLayoutSetUp(mat->cmap));

  /* check if the matrix sizes are correct */
  PetscCall(MatGetSize(mat, &rows, &cols));
  PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);

  /* read in row lengths and build row indices */
  PetscCall(MatGetLocalSize(mat, &m, NULL));
  PetscCall(PetscMalloc1(m + 1, &rowidxs));
  PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
  rowidxs[0] = 0;
  /* prefix-sum the row lengths in place to obtain CSR row offsets */
  for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
  /* validate the total only when the header carries a finite nonzero count
     (nz == PETSC_MAX_INT skips the check) */
  if (nz != PETSC_MAX_INT) {
    PetscCall(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
    PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
  }

  /* read in column indices and matrix values */
  PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
  PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
  PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
  /* store matrix indices and values */
  PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
  PetscCall(PetscFree(rowidxs));
  PetscCall(PetscFree2(colidxs, matvals));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3088 
3089 /* Not scalable because of ISAllGather() unless getting all columns. */
3090 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3091 {
3092   IS          iscol_local;
3093   PetscBool   isstride;
3094   PetscMPIInt lisstride = 0, gisstride;
3095 
3096   PetscFunctionBegin;
3097   /* check if we are grabbing all columns*/
3098   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3099 
3100   if (isstride) {
3101     PetscInt start, len, mstart, mlen;
3102     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3103     PetscCall(ISGetLocalSize(iscol, &len));
3104     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3105     if (mstart == start && mlen - mstart == len) lisstride = 1;
3106   }
3107 
3108   PetscCall(MPIU_Allreduce(&lisstride, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3109   if (gisstride) {
3110     PetscInt N;
3111     PetscCall(MatGetSize(mat, NULL, &N));
3112     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3113     PetscCall(ISSetIdentity(iscol_local));
3114     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3115   } else {
3116     PetscInt cbs;
3117     PetscCall(ISGetBlockSize(iscol, &cbs));
3118     PetscCall(ISAllGather(iscol, &iscol_local));
3119     PetscCall(ISSetBlockSize(iscol_local, cbs));
3120   }
3121 
3122   *isseq = iscol_local;
3123   PetscFunctionReturn(PETSC_SUCCESS);
3124 }
3125 
3126 /*
3127  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and global size of iscol_local
3128  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3129 
3130  Input Parameters:
3131 +   mat - matrix
3132 .   isrow - parallel row index set; its local indices are a subset of local columns of `mat`,
3133            i.e., mat->rstart <= isrow[i] < mat->rend
3134 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3135            i.e., mat->cstart <= iscol[i] < mat->cend
3136 
3137  Output Parameters:
3138 +   isrow_d - sequential row index set for retrieving mat->A
3139 .   iscol_d - sequential  column index set for retrieving mat->A
3140 .   iscol_o - sequential column index set for retrieving mat->B
3141 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3142  */
static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, const PetscInt *garray[])
{
  Vec             x, cmap;
  const PetscInt *is_idx;
  PetscScalar    *xarray, *cmaparray;
  PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
  Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
  Mat             B    = a->B;
  Vec             lvec = a->lvec, lcmap;
  PetscInt        i, cstart, cend, Bn = B->cmap->N;
  MPI_Comm        comm;
  VecScatter      Mvctx = a->Mvctx;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCall(ISGetLocalSize(iscol, &ncols));

  /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
  PetscCall(MatCreateVecs(mat, &x, NULL));
  PetscCall(VecSet(x, -1.0));
  PetscCall(VecDuplicate(x, &cmap));
  PetscCall(VecSet(cmap, -1.0));

  /* Get start indices: exclusive prefix sum of the local sizes, so isstart is the
     global index of this rank's first entry of iscol */
  PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
  isstart -= ncols;
  PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));

  /* mark selected columns in x (with their global index) and in cmap (with their
     position in the submatrix); unselected entries stay -1 */
  PetscCall(ISGetIndices(iscol, &is_idx));
  PetscCall(VecGetArray(x, &xarray));
  PetscCall(VecGetArray(cmap, &cmaparray));
  PetscCall(PetscMalloc1(ncols, &idx));
  for (i = 0; i < ncols; i++) {
    xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
    cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
    idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
  }
  PetscCall(VecRestoreArray(x, &xarray));
  PetscCall(VecRestoreArray(cmap, &cmaparray));
  PetscCall(ISRestoreIndices(iscol, &is_idx));

  /* Get iscol_d: local (shifted) column indices, inheriting iscol's block size */
  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
  PetscCall(ISGetBlockSize(iscol, &i));
  PetscCall(ISSetBlockSize(*iscol_d, i));

  /* Get isrow_d: local (shifted) row indices, inheriting isrow's block size */
  PetscCall(ISGetLocalSize(isrow, &m));
  rstart = mat->rmap->rstart;
  PetscCall(PetscMalloc1(m, &idx));
  PetscCall(ISGetIndices(isrow, &is_idx));
  for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
  PetscCall(ISRestoreIndices(isrow, &is_idx));

  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
  PetscCall(ISGetBlockSize(isrow, &i));
  PetscCall(ISSetBlockSize(*isrow_d, i));

  /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
  PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));

  PetscCall(VecDuplicate(lvec, &lcmap));

  PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));

  /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
  /* off-process column indices; allocate for the worst case where every ghost
     column of B is selected */
  count = 0;
  PetscCall(PetscMalloc1(Bn, &idx));
  PetscCall(PetscMalloc1(Bn, &cmap1));

  PetscCall(VecGetArray(lvec, &xarray));
  PetscCall(VecGetArray(lcmap, &cmaparray));
  for (i = 0; i < Bn; i++) {
    if (PetscRealPart(xarray[i]) > -1.0) { /* ghost column i was selected by some rank */
      idx[count]   = i;                                     /* local column index in off-diagonal part B */
      cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
      count++;
    }
  }
  PetscCall(VecRestoreArray(lvec, &xarray));
  PetscCall(VecRestoreArray(lcmap, &cmaparray));

  PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
  /* cannot ensure iscol_o has same blocksize as iscol! */

  PetscCall(PetscFree(idx));
  /* cmap1 ownership passes to the caller through *garray */
  *garray = cmap1;

  PetscCall(VecDestroy(&x));
  PetscCall(VecDestroy(&cmap));
  PetscCall(VecDestroy(&lcmap));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3239 
/* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
  Mat         M = NULL;
  MPI_Comm    comm;
  IS          iscol_d, isrow_d, iscol_o;
  Mat         Asub = NULL, Bsub = NULL;
  PetscInt    n;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));

  if (call == MAT_REUSE_MATRIX) {
    /* Retrieve isrow_d, iscol_d and iscol_o from submat; they were composed onto it
       by the MAT_INITIAL_MATRIX branch below */
    PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
    PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
    PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
    PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");

    /* Update diagonal and off-diagonal portions of submat */
    asub = (Mat_MPIAIJ *)(*submat)->data;
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
    PetscCall(ISGetLocalSize(iscol_o, &n));
    if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
    PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));

  } else { /* call == MAT_INITIAL_MATRIX) */
    const PetscInt *garray;
    PetscInt        BsubN;

    /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
    PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));

    /* Create local submatrices Asub and Bsub */
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
    PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));

    /* Create submatrix M; Asub/Bsub ownership transfers to M */
    PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, Asub, Bsub, garray, &M));

    /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
    asub = (Mat_MPIAIJ *)M->data;

    PetscCall(ISGetLocalSize(iscol_o, &BsubN));
    n = asub->B->cmap->N;
    if (BsubN > n) {
      /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
      const PetscInt *idx;
      PetscInt        i, j, *idx_new, *subgarray = asub->garray;
      PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));

      /* walk the two sorted ghost-column arrays in lockstep, keeping only the
         iscol_o entries whose global column survived the condensation */
      PetscCall(PetscMalloc1(n, &idx_new));
      j = 0;
      PetscCall(ISGetIndices(iscol_o, &idx));
      for (i = 0; i < n; i++) {
        if (j >= BsubN) break;
        while (subgarray[i] > garray[j]) j++;

        if (subgarray[i] == garray[j]) {
          idx_new[i] = idx[j++];
        } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot < garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
      }
      PetscCall(ISRestoreIndices(iscol_o, &idx));

      PetscCall(ISDestroy(&iscol_o));
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));

    } else if (BsubN < n) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
    }

    PetscCall(PetscFree(garray));
    *submat = M;

    /* Save isrow_d, iscol_d and iscol_o used in processor for next request;
       composition keeps its own reference, so we drop ours immediately */
    PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
    PetscCall(ISDestroy(&isrow_d));

    PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
    PetscCall(ISDestroy(&iscol_d));

    PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
    PetscCall(ISDestroy(&iscol_o));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3332 
/* Extract a parallel submatrix of mat defined by isrow/iscol. Chooses among three
   implementations depending on whether isrow/iscol share mat's row/column
   distribution (cheap paths) or not (general, non-scalable gather path). */
PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
{
  IS        iscol_local = NULL, isrow_d;
  PetscInt  csize;
  PetscInt  n, i, j, start, end;
  PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
  MPI_Comm  comm;

  PetscFunctionBegin;
  /* If isrow has same processor distribution as mat,
     call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with global size of iscol */
  if (call == MAT_REUSE_MATRIX) {
    /* the composed objects tell us which path created *newmat the first time */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
    if (isrow_d) {
      sameRowDist  = PETSC_TRUE;
      tsameDist[1] = PETSC_TRUE; /* sameColDist */
    } else {
      PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
      if (iscol_local) {
        sameRowDist  = PETSC_TRUE;
        tsameDist[1] = PETSC_FALSE; /* !sameColDist */
      }
    }
  } else {
    /* Check if isrow has same processor distribution as mat:
       min/max of the local indices must fall inside this rank's ownership range */
    sameDist[0] = PETSC_FALSE;
    PetscCall(ISGetLocalSize(isrow, &n));
    if (!n) {
      sameDist[0] = PETSC_TRUE;
    } else {
      PetscCall(ISGetMinMax(isrow, &i, &j));
      PetscCall(MatGetOwnershipRange(mat, &start, &end));
      if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
    }

    /* Check if iscol has same processor distribution as mat */
    sameDist[1] = PETSC_FALSE;
    PetscCall(ISGetLocalSize(iscol, &n));
    if (!n) {
      sameDist[1] = PETSC_TRUE;
    } else {
      PetscCall(ISGetMinMax(iscol, &i, &j));
      PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
      if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
    }

    /* logical AND across all ranks: the fast paths require the property globally */
    PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
    PetscCall(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
    sameRowDist = tsameDist[0];
  }

  if (sameRowDist) {
    if (tsameDist[1]) { /* sameRowDist & sameColDist */
      /* isrow and iscol have same processor distribution as mat */
      PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
      PetscFunctionReturn(PETSC_SUCCESS);
    } else { /* sameRowDist */
      /* isrow has same processor distribution as mat */
      if (call == MAT_INITIAL_MATRIX) {
        PetscBool sorted;
        PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
        PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
        PetscCall(ISGetSize(iscol, &i));
        PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);

        PetscCall(ISSorted(iscol_local, &sorted));
        if (sorted) {
          /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
          PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
          PetscFunctionReturn(PETSC_SUCCESS);
        }
        /* unsorted iscol_local: fall through to the general path below */
      } else { /* call == MAT_REUSE_MATRIX */
        IS iscol_sub;
        PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
        if (iscol_sub) {
          PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
          PetscFunctionReturn(PETSC_SUCCESS);
        }
      }
    }
  }

  /* General case: iscol -> iscol_local which has global size of iscol */
  if (call == MAT_REUSE_MATRIX) {
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
    PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
  } else {
    if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
  }

  PetscCall(ISGetLocalSize(iscol, &csize));
  PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));

  if (call == MAT_INITIAL_MATRIX) {
    /* stash the gathered IS on the submatrix so a later MAT_REUSE_MATRIX call can find it */
    PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
    PetscCall(ISDestroy(&iscol_local));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3432 
3433 /*@C
3434   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3435   and "off-diagonal" part of the matrix in CSR format.
3436 
3437   Collective
3438 
3439   Input Parameters:
3440 + comm   - MPI communicator
3441 . A      - "diagonal" portion of matrix
3442 . B      - "off-diagonal" portion of matrix, may have empty columns, will be destroyed by this routine
3443 - garray - global index of `B` columns
3444 
3445   Output Parameter:
3446 . mat - the matrix, with input `A` as its local diagonal matrix
3447 
3448   Level: advanced
3449 
3450   Notes:
3451   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3452 
3453   `A` becomes part of output mat, `B` is destroyed by this routine. The user cannot use `A` and `B` anymore.
3454 
3455 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3456 @*/
PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, Mat A, Mat B, const PetscInt garray[], Mat *mat)
{
  Mat_MPIAIJ        *maij;
  Mat_SeqAIJ        *b  = (Mat_SeqAIJ *)B->data, *bnew;
  PetscInt          *oi = b->i, *oj = b->j, i, nz, col;
  const PetscScalar *oa;
  Mat                Bnew;
  PetscInt           m, n, N;
  MatType            mpi_mat_type;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, mat));
  PetscCall(MatGetSize(A, &m, &n));
  PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
  PetscCheck(PetscAbs(A->rmap->bs) == PetscAbs(B->rmap->bs), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
  /* remove check below; When B is created using iscol_o from ISGetSeqIS_SameColDist_Private(), its bs may not be same as A */
  /* PetscCheck(A->cmap->bs == B->cmap->bs,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"A column bs %" PetscInt_FMT " != B column bs %" PetscInt_FMT,A->cmap->bs,B->cmap->bs); */

  /* Get global columns of mat: sum of the local (diagonal-block) column counts */
  PetscCall(MPIU_Allreduce(&n, &N, 1, MPIU_INT, MPI_SUM, comm));

  PetscCall(MatSetSizes(*mat, m, n, PETSC_DECIDE, N));
  /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
  PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
  PetscCall(MatSetType(*mat, mpi_mat_type));

  if (A->rmap->bs > 1 || A->cmap->bs > 1) PetscCall(MatSetBlockSizes(*mat, A->rmap->bs, A->cmap->bs));
  maij = (Mat_MPIAIJ *)(*mat)->data;

  (*mat)->preallocated = PETSC_TRUE;

  PetscCall(PetscLayoutSetUp((*mat)->rmap));
  PetscCall(PetscLayoutSetUp((*mat)->cmap));

  /* Set A as diagonal portion of *mat; *mat takes ownership, caller must not use A afterwards */
  maij->A = A;

  /* translate B's local column indices to global ones in place via garray;
     the oi/oj arrays will be reused by Bnew below */
  nz = oi[m];
  for (i = 0; i < nz; i++) {
    col   = oj[i];
    oj[i] = garray[col];
  }

  /* Set Bnew as off-diagonal portion of *mat; it wraps B's existing i/j/a arrays */
  PetscCall(MatSeqAIJGetArrayRead(B, &oa));
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, N, oi, oj, (PetscScalar *)oa, &Bnew));
  PetscCall(MatSeqAIJRestoreArrayRead(B, &oa));
  bnew        = (Mat_SeqAIJ *)Bnew->data;
  bnew->maxnz = b->maxnz; /* allocated nonzeros of B */
  maij->B     = Bnew;

  PetscCheck(B->rmap->N == Bnew->rmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "BN %" PetscInt_FMT " != BnewN %" PetscInt_FMT, B->rmap->N, Bnew->rmap->N);

  /* transfer array ownership from B to Bnew so destroying B does not free them */
  b->singlemalloc = PETSC_FALSE; /* B arrays are shared by Bnew */
  b->free_a       = PETSC_FALSE;
  b->free_ij      = PETSC_FALSE;
  PetscCall(MatDestroy(&B));

  bnew->singlemalloc = PETSC_TRUE; /* arrays will be freed by MatDestroy(&Bnew) */
  bnew->free_a       = PETSC_TRUE;
  bnew->free_ij      = PETSC_TRUE;

  /* condense columns of maij->B */
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
  PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3527 
3528 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3529 
/*
  MatCreateSubMatrix_MPIAIJ_SameRowDist - extracts the parallel submatrix mat[isrow, iscol].
  NOTE(review): the name suggests this path is used when the submatrix keeps the same parallel
  row distribution as mat — confirm against the caller.

  On MAT_INITIAL_MATRIX it builds a compressed column map and a sequential per-process piece
  (Msub), assembles the parallel result, and stashes Msub plus the two index sets on *newmat
  (keys "SubMatrix", "SubIScol", "Subcmap") so a later MAT_REUSE_MATRIX call can refill values
  without redoing the symbolic work.

  iscol_local - sequential IS with ALL requested column indices for this process; must be
                sorted (may contain duplicates).  Ownership is taken (composed as "ISAllGather")
                on the initial call.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
{
  PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
  PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
  Mat             M, Msub, B = a->B;
  MatScalar      *aa;
  Mat_SeqAIJ     *aij;
  PetscInt       *garray = a->garray, *colsub, Ncols;
  PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
  IS              iscol_sub, iscmap;
  const PetscInt *is_idx, *cmap;
  PetscBool       allcolumns = PETSC_FALSE;
  MPI_Comm        comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  if (call == MAT_REUSE_MATRIX) {
    /* Recover the objects stashed on *newmat by a previous MAT_INITIAL_MATRIX call */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
    PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
    PetscCall(ISGetLocalSize(iscol_sub, &count));

    PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
    PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");

    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
    PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");

    /* Numeric-only refill of the sequential piece */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));

  } else { /* call == MAT_INITIAL_MATRIX) */
    PetscBool flg;

    PetscCall(ISGetLocalSize(iscol, &n));
    PetscCall(ISGetSize(iscol, &Ncols));

    /* (1) iscol -> nonscalable iscol_local */
    /* Check for special case: each processor gets entire matrix columns */
    PetscCall(ISIdentity(iscol_local, &flg));
    if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
    /* allcolumns must hold on every rank for the fast path to be valid */
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (allcolumns) {
      iscol_sub = iscol_local;
      PetscCall(PetscObjectReference((PetscObject)iscol_local));
      PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));

    } else {
      /* (2) iscol_local -> iscol_sub and iscmap. Implementation below requires iscol_local be sorted, it can have duplicate indices */
      PetscInt *idx, *cmap1, k;
      PetscCall(PetscMalloc1(Ncols, &idx));
      PetscCall(PetscMalloc1(Ncols, &cmap1));
      PetscCall(ISGetIndices(iscol_local, &is_idx));
      count = 0; /* number of requested columns that intersect this process's diagonal or off-diagonal blocks */
      k     = 0; /* merge cursor into the sorted off-diagonal column map garray */
      for (i = 0; i < Ncols; i++) {
        j = is_idx[i];
        if (j >= cstart && j < cend) {
          /* diagonal part of mat */
          idx[count]     = j;
          cmap1[count++] = i; /* column index in submat */
        } else if (Bn) {
          /* off-diagonal part of mat */
          if (j == garray[k]) {
            idx[count]     = j;
            cmap1[count++] = i; /* column index in submat */
          } else if (j > garray[k]) {
            /* advance through garray until it catches up with j (both are sorted) */
            while (j > garray[k] && k < Bn - 1) k++;
            if (j == garray[k]) {
              idx[count]     = j;
              cmap1[count++] = i; /* column index in submat */
            }
          }
        }
      }
      PetscCall(ISRestoreIndices(iscol_local, &is_idx));

      /* idx/cmap1 ownership is transferred to the index sets (PETSC_OWN_POINTER) */
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
      PetscCall(ISGetBlockSize(iscol, &cbs));
      PetscCall(ISSetBlockSize(iscol_sub, cbs));

      PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
    }

    /* (3) Create sequential Msub */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
  }

  PetscCall(ISGetLocalSize(iscol_sub, &count));
  aij = (Mat_SeqAIJ *)(Msub)->data;
  ii  = aij->i;
  PetscCall(ISGetIndices(iscmap, &cmap));

  /*
      m - number of local rows
      Ncols - number of columns (same on all processors)
      rstart - first row in new global matrix generated
  */
  PetscCall(MatGetSize(Msub, &m, NULL));

  if (call == MAT_INITIAL_MATRIX) {
    /* (4) Create parallel newmat */
    PetscMPIInt rank, size;
    PetscInt    csize;

    PetscCallMPI(MPI_Comm_size(comm, &size));
    PetscCallMPI(MPI_Comm_rank(comm, &rank));

    /*
        Determine the number of non-zeros in the diagonal and off-diagonal
        portions of the matrix in order to do correct preallocation
    */

    /* first get start and end of "diagonal" columns */
    PetscCall(ISGetLocalSize(iscol, &csize));
    if (csize == PETSC_DECIDE) {
      PetscCall(ISGetSize(isrow, &mglobal));
      if (mglobal == Ncols) { /* square matrix */
        nlocal = m;
      } else {
        nlocal = Ncols / size + ((Ncols % size) > rank);
      }
    } else {
      nlocal = csize;
    }
    /* prefix sum of local column counts gives this rank's [rstart, rend) column ownership */
    PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
    rstart = rend - nlocal;
    PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);

    /* next, compute all the lengths */
    jj = aij->j;
    PetscCall(PetscMalloc1(2 * m + 1, &dlens));
    olens = dlens + m; /* olens shares the single allocation with dlens */
    for (i = 0; i < m; i++) {
      jend = ii[i + 1] - ii[i];
      olen = 0;
      dlen = 0;
      for (j = 0; j < jend; j++) {
        /* cmap translates Msub's local column to the submatrix's global column */
        if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
        else dlen++;
        jj++;
      }
      olens[i] = olen;
      dlens[i] = dlen;
    }

    PetscCall(ISGetBlockSize(isrow, &bs));
    PetscCall(ISGetBlockSize(iscol, &cbs));

    PetscCall(MatCreate(comm, &M));
    PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
    PetscCall(MatSetBlockSizes(M, bs, cbs));
    PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
    PetscCall(PetscFree(dlens));

  } else { /* call == MAT_REUSE_MATRIX */
    M = *newmat;
    PetscCall(MatGetLocalSize(M, &i, NULL));
    PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
    PetscCall(MatZeroEntries(M));
    /*
         The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
       rather than the slower MatSetValues().
    */
    M->was_assembled = PETSC_TRUE;
    M->assembled     = PETSC_FALSE;
  }

  /* (5) Set values of Msub to *newmat */
  PetscCall(PetscMalloc1(count, &colsub));
  PetscCall(MatGetOwnershipRange(M, &rstart, NULL));

  jj = aij->j;
  PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
  for (i = 0; i < m; i++) {
    row = rstart + i;
    nz  = ii[i + 1] - ii[i];
    for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]]; /* remap to submatrix global columns */
    PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
    jj += nz;
    aa += nz;
  }
  PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
  PetscCall(ISRestoreIndices(iscmap, &cmap));

  PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));

  PetscCall(PetscFree(colsub));

  /* save Msub, iscol_sub and iscmap used in processor for next request */
  if (call == MAT_INITIAL_MATRIX) {
    *newmat = M;
    /* PetscObjectCompose takes a reference, so the local handles can be released */
    PetscCall(PetscObjectCompose((PetscObject)(*newmat), "SubMatrix", (PetscObject)Msub));
    PetscCall(MatDestroy(&Msub));

    PetscCall(PetscObjectCompose((PetscObject)(*newmat), "SubIScol", (PetscObject)iscol_sub));
    PetscCall(ISDestroy(&iscol_sub));

    PetscCall(PetscObjectCompose((PetscObject)(*newmat), "Subcmap", (PetscObject)iscmap));
    PetscCall(ISDestroy(&iscmap));

    if (iscol_local) {
      PetscCall(PetscObjectCompose((PetscObject)(*newmat), "ISAllGather", (PetscObject)iscol_local));
      PetscCall(ISDestroy(&iscol_local));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3739 
3740 /*
3741     Not great since it makes two copies of the submatrix, first an SeqAIJ
3742   in local and then by concatenating the local matrices the end result.
3743   Writing it directly would be much like MatCreateSubMatrices_MPIAIJ()
3744 
3745   This requires a sequential iscol with all indices.
3746 */
/*
  MatCreateSubMatrix_MPIAIJ_nonscalable - extracts the parallel submatrix mat[isrow, iscol],
  first forming a sequential per-process copy (Mreuse) and then assembling the parallel result.

  iscol must be a sequential IS holding all requested column indices (hence "nonscalable" —
  see the comment preceding this function).  csize is the local column size of the result, or
  PETSC_DECIDE to have it computed here.  On MAT_INITIAL_MATRIX the sequential piece is stashed
  on the result (key "SubMatrix") so a MAT_REUSE_MATRIX call can refill values only.
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
{
  PetscMPIInt rank, size;
  PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
  PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
  Mat         M, Mreuse;
  MatScalar  *aa, *vwork;
  MPI_Comm    comm;
  Mat_SeqAIJ *aij;
  PetscBool   colflag, allcolumns = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol, &colflag));
  PetscCall(ISGetLocalSize(iscol, &n));
  if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
  /* the fast path is only valid if every rank requested all columns */
  PetscCall(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));

  if (call == MAT_REUSE_MATRIX) {
    /* numeric-only refill of the sequential piece stashed on a previous call */
    PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
    PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
  } else {
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
  }

  /*
      m - number of local rows
      n - number of columns (same on all processors)
      rstart - first row in new global matrix generated
  */
  PetscCall(MatGetSize(Mreuse, &m, &n));
  PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
  if (call == MAT_INITIAL_MATRIX) {
    aij = (Mat_SeqAIJ *)(Mreuse)->data;
    ii  = aij->i;
    jj  = aij->j;

    /*
        Determine the number of non-zeros in the diagonal and off-diagonal
        portions of the matrix in order to do correct preallocation
    */

    /* first get start and end of "diagonal" columns */
    if (csize == PETSC_DECIDE) {
      PetscCall(ISGetSize(isrow, &mglobal));
      if (mglobal == n) { /* square matrix */
        nlocal = m;
      } else {
        nlocal = n / size + ((n % size) > rank);
      }
    } else {
      nlocal = csize;
    }
    /* prefix sum of local column counts gives this rank's [rstart, rend) column ownership */
    PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
    rstart = rend - nlocal;
    PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);

    /* next, compute all the lengths */
    PetscCall(PetscMalloc1(2 * m + 1, &dlens));
    olens = dlens + m; /* olens shares the single allocation with dlens */
    for (i = 0; i < m; i++) {
      jend = ii[i + 1] - ii[i];
      olen = 0;
      dlen = 0;
      for (j = 0; j < jend; j++) {
        if (*jj < rstart || *jj >= rend) olen++;
        else dlen++;
        jj++;
      }
      olens[i] = olen;
      dlens[i] = dlen;
    }
    PetscCall(MatCreate(comm, &M));
    PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
    PetscCall(MatSetBlockSizes(M, bs, cbs));
    PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
    PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
    PetscCall(PetscFree(dlens));
  } else {
    PetscInt ml, nl;

    M = *newmat;
    PetscCall(MatGetLocalSize(M, &ml, &nl));
    PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
    PetscCall(MatZeroEntries(M));
    /*
         The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
       rather than the slower MatSetValues().
    */
    M->was_assembled = PETSC_TRUE;
    M->assembled     = PETSC_FALSE;
  }
  PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
  aij = (Mat_SeqAIJ *)(Mreuse)->data;
  ii  = aij->i;
  jj  = aij->j;

  /* trigger copy to CPU if needed */
  PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
  for (i = 0; i < m; i++) {
    /* insert row i of the sequential piece as global row rstart+i of the parallel result */
    row   = rstart + i;
    nz    = ii[i + 1] - ii[i];
    cwork = jj;
    jj += nz;
    vwork = aa;
    aa += nz;
    PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
  }
  PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));

  PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
  *newmat = M;

  /* save submatrix used in processor for next request */
  if (call == MAT_INITIAL_MATRIX) {
    /* PetscObjectCompose takes a reference, so the local handle can be released */
    PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
    PetscCall(MatDestroy(&Mreuse));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3873 
3874 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3875 {
3876   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3877   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii;
3878   const PetscInt *JJ;
3879   PetscBool       nooffprocentries;
3880   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3881 
3882   PetscFunctionBegin;
3883   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Ii[0] must be 0 it is %" PetscInt_FMT, Ii[0]);
3884 
3885   PetscCall(PetscLayoutSetUp(B->rmap));
3886   PetscCall(PetscLayoutSetUp(B->cmap));
3887   m      = B->rmap->n;
3888   cstart = B->cmap->rstart;
3889   cend   = B->cmap->rend;
3890   rstart = B->rmap->rstart;
3891 
3892   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3893 
3894   if (PetscDefined(USE_DEBUG)) {
3895     for (i = 0; i < m; i++) {
3896       nnz = Ii[i + 1] - Ii[i];
3897       JJ  = J ? J + Ii[i] : NULL;
3898       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3899       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3900       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3901     }
3902   }
3903 
3904   for (i = 0; i < m; i++) {
3905     nnz     = Ii[i + 1] - Ii[i];
3906     JJ      = J ? J + Ii[i] : NULL;
3907     nnz_max = PetscMax(nnz_max, nnz);
3908     d       = 0;
3909     for (j = 0; j < nnz; j++) {
3910       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3911     }
3912     d_nnz[i] = d;
3913     o_nnz[i] = nnz - d;
3914   }
3915   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3916   PetscCall(PetscFree2(d_nnz, o_nnz));
3917 
3918   for (i = 0; i < m; i++) {
3919     ii = i + rstart;
3920     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], J ? J + Ii[i] : NULL, v ? v + Ii[i] : NULL, INSERT_VALUES));
3921   }
3922   nooffprocentries    = B->nooffprocentries;
3923   B->nooffprocentries = PETSC_TRUE;
3924   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3925   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3926   B->nooffprocentries = nooffprocentries;
3927 
3928   /* count number of entries below block diagonal */
3929   PetscCall(PetscFree(Aij->ld));
3930   PetscCall(PetscCalloc1(m, &ld));
3931   Aij->ld = ld;
3932   for (i = 0; i < m; i++) {
3933     nnz = Ii[i + 1] - Ii[i];
3934     j   = 0;
3935     while (j < nnz && J[j] < cstart) j++;
3936     ld[i] = j;
3937     if (J) J += nnz;
3938   }
3939 
3940   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3941   PetscFunctionReturn(PETSC_SUCCESS);
3942 }
3943 
3944 /*@
3945   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3946   (the default parallel PETSc format).
3947 
3948   Collective
3949 
3950   Input Parameters:
3951 + B - the matrix
3952 . i - the indices into j for the start of each local row (starts with zero)
3953 . j - the column indices for each local row (starts with zero)
3954 - v - optional values in the matrix
3955 
3956   Level: developer
3957 
3958   Notes:
3959   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3960   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3961   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3962 
3963   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
3964 
3965   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3966 
3967   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
3968 
3969   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
3970   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
3971 
3972   The format which is used for the sparse matrix input, is equivalent to a
3973   row-major ordering.. i.e for the following matrix, the input data expected is
3974   as shown
3975 .vb
3976         1 0 0
3977         2 0 3     P0
3978        -------
3979         4 5 6     P1
3980 
3981      Process0 [P0] rows_owned=[0,1]
3982         i =  {0,1,3}  [size = nrow+1  = 2+1]
3983         j =  {0,0,2}  [size = 3]
3984         v =  {1,2,3}  [size = 3]
3985 
3986      Process1 [P1] rows_owned=[2]
3987         i =  {0,3}    [size = nrow+1  = 1+1]
3988         j =  {0,1,2}  [size = 3]
3989         v =  {4,5,6}  [size = 3]
3990 .ve
3991 
3992 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
3993           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3994 @*/
PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed as "MatMPIAIJSetPreallocationCSR_C";
     PetscTryMethod does nothing if B's type does not provide one */
  PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4001 
4002 /*@C
4003   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4004   (the default parallel PETSc format).  For good matrix assembly performance
4005   the user should preallocate the matrix storage by setting the parameters
4006   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4007 
4008   Collective
4009 
4010   Input Parameters:
4011 + B     - the matrix
4012 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4013            (same value is used for all local rows)
4014 . d_nnz - array containing the number of nonzeros in the various rows of the
4015            DIAGONAL portion of the local submatrix (possibly different for each row)
4016            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4017            The size of this array is equal to the number of local rows, i.e 'm'.
4018            For matrices that will be factored, you must leave room for (and set)
4019            the diagonal entry even if it is zero.
4020 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4021            submatrix (same value is used for all local rows).
4022 - o_nnz - array containing the number of nonzeros in the various rows of the
4023            OFF-DIAGONAL portion of the local submatrix (possibly different for
4024            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4025            structure. The size of this array is equal to the number
4026            of local rows, i.e 'm'.
4027 
4028   Example Usage:
4029   Consider the following 8x8 matrix with 34 non-zero values, that is
4030   assembled across 3 processors. Lets assume that proc0 owns 3 rows,
4031   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4032   as follows
4033 
4034 .vb
4035             1  2  0  |  0  3  0  |  0  4
4036     Proc0   0  5  6  |  7  0  0  |  8  0
4037             9  0 10  | 11  0  0  | 12  0
4038     -------------------------------------
4039            13  0 14  | 15 16 17  |  0  0
4040     Proc1   0 18  0  | 19 20 21  |  0  0
4041             0  0  0  | 22 23  0  | 24  0
4042     -------------------------------------
4043     Proc2  25 26 27  |  0  0 28  | 29  0
4044            30  0  0  | 31 32 33  |  0 34
4045 .ve
4046 
4047   This can be represented as a collection of submatrices as
4048 .vb
4049       A B C
4050       D E F
4051       G H I
4052 .ve
4053 
4054   Where the submatrices A,B,C are owned by proc0, D,E,F are
4055   owned by proc1, G,H,I are owned by proc2.
4056 
4057   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4058   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4059   The 'M','N' parameters are 8,8, and have the same values on all procs.
4060 
4061   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4062   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4063   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4064   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4065   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
  matrix, and [DF] as another `MATSEQAIJ` matrix.
4067 
4068   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4069   allocated for every row of the local diagonal submatrix, and `o_nz`
4070   storage locations are allocated for every row of the OFF-DIAGONAL submat.
4071   One way to choose `d_nz` and `o_nz` is to use the max nonzerors per local
4072   rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4073   In this case, the values of `d_nz`, `o_nz` are
4074 .vb
4075      proc0  dnz = 2, o_nz = 2
4076      proc1  dnz = 3, o_nz = 2
4077      proc2  dnz = 1, o_nz = 4
4078 .ve
4079   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4080   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2. i.e we are using 12+15+10=37 storage locations to store
4082   34 values.
4083 
4084   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4085   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4086   In the above case the values for `d_nnz`, `o_nnz` are
4087 .vb
4088      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4089      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4090      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4091 .ve
4092   Here the space allocated is sum of all the above values i.e 34, and
4093   hence pre-allocation is perfect.
4094 
4095   Level: intermediate
4096 
4097   Notes:
4098   If the *_nnz parameter is given then the *_nz parameter is ignored
4099 
4100   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4101   storage.  The stored row and column indices begin with zero.
4102   See [Sparse Matrices](sec_matsparse) for details.
4103 
4104   The parallel matrix is partitioned such that the first m0 rows belong to
4105   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4106   to process 2 etc.. where m0,m1,m2... are the input parameter 'm'.
4107 
4108   The DIAGONAL portion of the local submatrix of a processor can be defined
4109   as the submatrix which is obtained by extraction the part corresponding to
4110   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4111   first row that belongs to the processor, r2 is the last row belonging to
4112   the this processor, and c1-c2 is range of indices of the local part of a
4113   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4114   common case of a square matrix, the row and column ranges are the same and
4115   the DIAGONAL part is also square. The remaining portion of the local
4116   submatrix (mxN) constitute the OFF-DIAGONAL portion.
4117 
4118   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4119 
4120   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4121   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
4122   You can also run with the option `-info` and look for messages with the string
4123   malloc in them to see if additional memory allocation was needed.
4124 
4125 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4126           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4127 @*/
PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
  PetscValidType(B, 1);
  /* Dispatch to the type-specific implementation composed as "MatMPIAIJSetPreallocation_C";
     PetscTryMethod does nothing if B's type does not provide one */
  PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4136 
4137 /*@
4138   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain in standard
4139   CSR format for the local rows.
4140 
4141   Collective
4142 
4143   Input Parameters:
4144 + comm - MPI communicator
4145 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4146 . n    - This value should be the same as the local size used in creating the
4147        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4148        calculated if N is given) For square matrices n is almost always m.
4149 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4150 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4151 . i    - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4152 . j    - column indices
4153 - a    - optional matrix values
4154 
4155   Output Parameter:
4156 . mat - the matrix
4157 
4158   Level: intermediate
4159 
4160   Notes:
4161   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4162   thus you CANNOT change the matrix entries by changing the values of a[] after you have
4163   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4164 
4165   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4166 
4167   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArrays()`
4168 
4169   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
4170   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
4171 
4172   The format which is used for the sparse matrix input, is equivalent to a
4173   row-major ordering.. i.e for the following matrix, the input data expected is
4174   as shown
4175 .vb
4176         1 0 0
4177         2 0 3     P0
4178        -------
4179         4 5 6     P1
4180 
4181      Process0 [P0] rows_owned=[0,1]
4182         i =  {0,1,3}  [size = nrow+1  = 2+1]
4183         j =  {0,0,2}  [size = 3]
4184         v =  {1,2,3}  [size = 3]
4185 
4186      Process1 [P1] rows_owned=[2]
4187         i =  {0,3}    [size = nrow+1  = 1+1]
4188         j =  {0,1,2}  [size = 3]
4189         v =  {4,5,6}  [size = 3]
4190 .ve
4191 
.seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4193           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4194 @*/
PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
{
  PetscFunctionBegin;
  /* validate the CSR input before creating anything */
  PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
  PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
  PetscCall(MatCreate(comm, mat));
  PetscCall(MatSetSizes(*mat, m, n, M, N));
  /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
  PetscCall(MatSetType(*mat, MATMPIAIJ));
  /* copies i/j/a into the matrix and assembles it; the caller's arrays are not retained */
  PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4207 
4208 /*@
4209   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain in standard
4210   CSR format for the local rows. Only the numerical values are updated the other arrays must be identical to what was passed
4211   from `MatCreateMPIAIJWithArrays()`
4212 
4213   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4214 
4215   Collective
4216 
4217   Input Parameters:
4218 + mat - the matrix
4219 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4220 . n   - This value should be the same as the local size used in creating the
4221        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4222        calculated if N is given) For square matrices n is almost always m.
4223 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4224 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4225 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4226 . J   - column indices
4227 - v   - matrix values
4228 
4229   Level: deprecated
4230 
4231 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4232           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4233 @*/
PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
{
  PetscInt        nnz, i;
  PetscBool       nooffprocentries;
  Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
  Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
  PetscScalar    *ad, *ao; /* value arrays of the diagonal (A) and off-diagonal (B) blocks */
  PetscInt        ldi, Iii, md;
  const PetscInt *Adi = Ad->i;       /* row offsets of the diagonal block */
  PetscInt       *ld  = Aij->ld;     /* per-row count of off-diagonal entries left of the diagonal block */

  PetscFunctionBegin;
  PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
  PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
  PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
  PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");

  PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));

  for (i = 0; i < m; i++) {
    if (PetscDefined(USE_DEBUG)) {
      /* the split below relies on the columns within each row being sorted and unique */
      for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
        PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
        PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
      }
    }
    /* row i of the flat array v is laid out as:
       [ldi off-diagonal entries | md diagonal-block entries | remaining off-diagonal entries] */
    nnz = Ii[i + 1] - Ii[i];
    Iii = Ii[i];
    ldi = ld[i];
    md  = Adi[i + 1] - Adi[i];
    PetscCall(PetscArraycpy(ao, v + Iii, ldi));
    PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
    PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
    ad += md;
    ao += nnz - md;
  }
  /* all values are locally owned, so skip off-process communication during assembly */
  nooffprocentries      = mat->nooffprocentries;
  mat->nooffprocentries = PETSC_TRUE;
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
  PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
  mat->nooffprocentries = nooffprocentries;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4283 
4284 /*@
4285   MatUpdateMPIAIJWithArray - updates an `MATMPIAIJ` matrix using an array that contains the nonzero values
4286 
4287   Collective
4288 
4289   Input Parameters:
4290 + mat - the matrix
4291 - v   - matrix values, stored by row
4292 
4293   Level: intermediate
4294 
4295   Notes:
4296   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4297 
4298   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4299 
4300 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4301           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4302 @*/
PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
{
  /* Overwrite the numerical values of an existing MATMPIAIJ matrix from a values-only array v,
     stored by row with sorted column order; the pattern comes from the matrix itself. */
  PetscInt        nnz, i, m;
  PetscBool       nooffprocentries;
  Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
  Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
  Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
  PetscScalar    *ad, *ao;                   /* write cursors into the diagonal (A) and off-diagonal (B) value arrays */
  const PetscInt *Adi = Ad->i, *Adj = Ao->i; /* row offsets of A and (despite the name) of the off-diagonal block B */
  PetscInt        ldi, Iii, md;
  PetscInt       *ld = Aij->ld; /* ld[i]: entries of row i stored in B before the diagonal block columns start */

  PetscFunctionBegin;
  m = mat->rmap->n;

  PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
  Iii = 0; /* running offset of the current row in v[] */
  for (i = 0; i < m; i++) {
    nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i]; /* total entries of row i: A part plus B part */
    ldi = ld[i];               /* entries going to B that precede the diagonal block columns */
    md  = Adi[i + 1] - Adi[i]; /* entries going to the diagonal block A */
    /* with sorted columns, row i of v is laid out as [B-left | A | B-right] */
    PetscCall(PetscArraycpy(ao, v + Iii, ldi));
    PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
    PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
    ad += md;
    ao += nnz - md;
    Iii += nnz;
  }
  /* every entry was written locally, so suppress off-process communication during assembly */
  nooffprocentries      = mat->nooffprocentries;
  mat->nooffprocentries = PETSC_TRUE;
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
  PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
  /* bump object states so caches that depend on the matrix values are invalidated */
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
  PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
  PetscCall(PetscObjectStateIncrease((PetscObject)mat));
  PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
  mat->nooffprocentries = nooffprocentries;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4344 
4345 /*@C
4346   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4347   (the default parallel PETSc format).  For good matrix assembly performance
4348   the user should preallocate the matrix storage by setting the parameters
4349   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4350 
4351   Collective
4352 
4353   Input Parameters:
4354 + comm  - MPI communicator
4355 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4356            This value should be the same as the local size used in creating the
4357            y vector for the matrix-vector product y = Ax.
4358 . n     - This value should be the same as the local size used in creating the
4359        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4360        calculated if N is given) For square matrices n is almost always m.
4361 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4362 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4363 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4364            (same value is used for all local rows)
4365 . d_nnz - array containing the number of nonzeros in the various rows of the
4366            DIAGONAL portion of the local submatrix (possibly different for each row)
4367            or `NULL`, if `d_nz` is used to specify the nonzero structure.
4368            The size of this array is equal to the number of local rows, i.e 'm'.
4369 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4370            submatrix (same value is used for all local rows).
4371 - o_nnz - array containing the number of nonzeros in the various rows of the
4372            OFF-DIAGONAL portion of the local submatrix (possibly different for
4373            each row) or `NULL`, if `o_nz` is used to specify the nonzero
4374            structure. The size of this array is equal to the number
4375            of local rows, i.e 'm'.
4376 
4377   Output Parameter:
4378 . A - the matrix
4379 
4380   Options Database Keys:
4381 + -mat_no_inode                     - Do not use inodes
4382 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4383 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4384         See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the vecscatter be viewed as a matrix.
4385         Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4386 
4387   Level: intermediate
4388 
4389   Notes:
4390   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4391   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4392   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4393 
4394   If the *_nnz parameter is given then the *_nz parameter is ignored
4395 
4396   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4397   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4398   storage requirements for this matrix.
4399 
  If `PETSC_DECIDE` or `PETSC_DETERMINE` is used for a particular argument on one
  processor then it must be used on all processors that share the object for
  that argument.
4403 
4404   The user MUST specify either the local or global matrix dimensions
4405   (possibly both).
4406 
4407   The parallel matrix is partitioned across processors such that the
4408   first m0 rows belong to process 0, the next m1 rows belong to
4409   process 1, the next m2 rows belong to process 2 etc.. where
4410   m0,m1,m2,.. are the input parameter 'm'. i.e each processor stores
4411   values corresponding to [m x N] submatrix.
4412 
4413   The columns are logically partitioned with the n0 columns belonging
4414   to 0th partition, the next n1 columns belonging to the next
4415   partition etc.. where n0,n1,n2... are the input parameter 'n'.
4416 
4417   The DIAGONAL portion of the local submatrix on any given processor
4418   is the submatrix corresponding to the rows and columns m,n
4419   corresponding to the given processor. i.e diagonal matrix on
4420   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4421   etc. The remaining portion of the local submatrix [m x (N-n)]
4422   constitute the OFF-DIAGONAL portion. The example below better
4423   illustrates this concept.
4424 
4425   For a square global matrix we define each processor's diagonal portion
4426   to be its local rows and the corresponding columns (a square submatrix);
4427   each processor's off-diagonal portion encompasses the remainder of the
4428   local matrix (a rectangular submatrix).
4429 
4430   If `o_nnz`, `d_nnz` are specified, then `o_nz`, and `d_nz` are ignored.
4431 
4432   When calling this routine with a single process communicator, a matrix of
4433   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4434   type of communicator, use the construction mechanism
4435 .vb
4436   MatCreate(..., &A);
4437   MatSetType(A, MATMPIAIJ);
4438   MatSetSizes(A, m, n, M, N);
4439   MatMPIAIJSetPreallocation(A, ...);
4440 .ve
4441 
4442   By default, this format uses inodes (identical nodes) when possible.
4443   We search for consecutive rows with the same nonzero structure, thereby
4444   reusing matrix information to achieve increased efficiency.
4445 
4446   Example Usage:
4447   Consider the following 8x8 matrix with 34 non-zero values, that is
4448   assembled across 3 processors. Lets assume that proc0 owns 3 rows,
4449   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4450   as follows
4451 
4452 .vb
4453             1  2  0  |  0  3  0  |  0  4
4454     Proc0   0  5  6  |  7  0  0  |  8  0
4455             9  0 10  | 11  0  0  | 12  0
4456     -------------------------------------
4457            13  0 14  | 15 16 17  |  0  0
4458     Proc1   0 18  0  | 19 20 21  |  0  0
4459             0  0  0  | 22 23  0  | 24  0
4460     -------------------------------------
4461     Proc2  25 26 27  |  0  0 28  | 29  0
4462            30  0  0  | 31 32 33  |  0 34
4463 .ve
4464 
4465   This can be represented as a collection of submatrices as
4466 
4467 .vb
4468       A B C
4469       D E F
4470       G H I
4471 .ve
4472 
4473   Where the submatrices A,B,C are owned by proc0, D,E,F are
4474   owned by proc1, G,H,I are owned by proc2.
4475 
4476   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4477   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4478   The 'M','N' parameters are 8,8, and have the same values on all procs.
4479 
4480   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4481   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4482   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4483   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4484   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
  matrix, and [DF] as another `MATSEQAIJ` matrix.
4486 
4487   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4488   allocated for every row of the local diagonal submatrix, and `o_nz`
4489   storage locations are allocated for every row of the OFF-DIAGONAL submat.
  One way to choose `d_nz` and `o_nz` is to use the max nonzeros per local
4491   rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4492   In this case, the values of `d_nz`,`o_nz` are
4493 .vb
4494      proc0  dnz = 2, o_nz = 2
4495      proc1  dnz = 3, o_nz = 2
4496      proc2  dnz = 1, o_nz = 4
4497 .ve
4498   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4499   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
  for proc2, i.e. we are using 12+15+10=37 storage locations to store
4501   34 values.
4502 
4503   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4504   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4505   In the above case the values for d_nnz,o_nnz are
4506 .vb
4507      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4508      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4509      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4510 .ve
4511   Here the space allocated is sum of all the above values i.e 34, and
4512   hence pre-allocation is perfect.
4513 
4514 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4515           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`
4516 @*/
4517 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4518 {
4519   PetscMPIInt size;
4520 
4521   PetscFunctionBegin;
4522   PetscCall(MatCreate(comm, A));
4523   PetscCall(MatSetSizes(*A, m, n, M, N));
4524   PetscCallMPI(MPI_Comm_size(comm, &size));
4525   if (size > 1) {
4526     PetscCall(MatSetType(*A, MATMPIAIJ));
4527     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4528   } else {
4529     PetscCall(MatSetType(*A, MATSEQAIJ));
4530     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4531   }
4532   PetscFunctionReturn(PETSC_SUCCESS);
4533 }
4534 
4535 /*MC
4536     MatMPIAIJGetSeqAIJF90 - Returns the local pieces of this distributed matrix
4537 
4538     Synopsis:
4539     MatMPIAIJGetSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4540 
4541     Not Collective
4542 
4543     Input Parameter:
4544 .   A - the `MATMPIAIJ` matrix
4545 
4546     Output Parameters:
4547 +   Ad - the diagonal portion of the matrix
4548 .   Ao - the off-diagonal portion of the matrix
4549 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4550 -   ierr - error code
4551 
4552      Level: advanced
4553 
4554     Note:
4555     Use  `MatMPIAIJRestoreSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4556 
4557 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJRestoreSeqAIJF90()`
4558 M*/
4559 
4560 /*MC
4561     MatMPIAIJRestoreSeqAIJF90 - call after `MatMPIAIJGetSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4562 
4563     Synopsis:
4564     MatMPIAIJRestoreSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4565 
4566     Not Collective
4567 
4568     Input Parameters:
4569 +   A - the `MATMPIAIJ` matrix
4570 .   Ad - the diagonal portion of the matrix
4571 .   Ao - the off-diagonal portion of the matrix
4572 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4573 -   ierr - error code
4574 
4575      Level: advanced
4576 
4577 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJGetSeqAIJF90()`
4578 M*/
4579 
4580 /*@C
4581   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4582 
4583   Not Collective
4584 
4585   Input Parameter:
4586 . A - The `MATMPIAIJ` matrix
4587 
4588   Output Parameters:
4589 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4590 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4591 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4592 
4593   Level: intermediate
4594 
4595   Note:
4596   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
  in `Ad` are in [0, Nc) where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4598   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4599   local column numbers to global column numbers in the original matrix.
4600 
4601   Fortran Notes:
4602   `MatMPIAIJGetSeqAIJ()` Fortran binding is deprecated (since PETSc 3.19), use `MatMPIAIJGetSeqAIJF90()`
4603 
4604 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJF90()`, `MatMPIAIJRestoreSeqAIJF90()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4605 @*/
4606 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4607 {
4608   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4609   PetscBool   flg;
4610 
4611   PetscFunctionBegin;
4612   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4613   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4614   if (Ad) *Ad = a->A;
4615   if (Ao) *Ao = a->B;
4616   if (colmap) *colmap = a->garray;
4617   PetscFunctionReturn(PETSC_SUCCESS);
4618 }
4619 
PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
{
  /* Stack the per-process sequential matrices inmat row-wise into one parallel matrix.
     MAT_INITIAL_MATRIX runs a symbolic phase (create + preallocate); the numeric phase
     then copies the values of inmat row by row into the owned rows of *outmat. */
  PetscInt     m, N, i, rstart, nnz, Ii;
  PetscInt    *indx;
  PetscScalar *values;
  MatType      rootType;

  PetscFunctionBegin;
  PetscCall(MatGetSize(inmat, &m, &N));
  if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
    PetscInt *dnz, *onz, sum, bs, cbs;

    if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
    /* Check sum(n) = N */
    PetscCall(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
    PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);

    /* first global row of this process = total rows contributed by lower ranks */
    PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
    rstart -= m;

    /* count diagonal/off-diagonal entries per local row for preallocation */
    MatPreallocateBegin(comm, m, n, dnz, onz);
    for (i = 0; i < m; i++) {
      PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
      PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
      PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
    }

    PetscCall(MatCreate(comm, outmat));
    PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
    PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
    PetscCall(MatGetRootType_Private(inmat, &rootType));
    PetscCall(MatSetType(*outmat, rootType));
    /* only the preallocation matching the actual type takes effect; the other is a no-op */
    PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
    PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
    MatPreallocateEnd(dnz, onz);
    PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  }

  /* numeric phase */
  PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
  for (i = 0; i < m; i++) {
    PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
    Ii = i + rstart; /* global row index */
    PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
    PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
  }
  PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4671 
4672 static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void *data)
4673 {
4674   Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)data;
4675 
4676   PetscFunctionBegin;
4677   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4678   PetscCall(PetscFree(merge->id_r));
4679   PetscCall(PetscFree(merge->len_s));
4680   PetscCall(PetscFree(merge->len_r));
4681   PetscCall(PetscFree(merge->bi));
4682   PetscCall(PetscFree(merge->bj));
4683   PetscCall(PetscFree(merge->buf_ri[0]));
4684   PetscCall(PetscFree(merge->buf_ri));
4685   PetscCall(PetscFree(merge->buf_rj[0]));
4686   PetscCall(PetscFree(merge->buf_rj));
4687   PetscCall(PetscFree(merge->coi));
4688   PetscCall(PetscFree(merge->coj));
4689   PetscCall(PetscFree(merge->owners_co));
4690   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4691   PetscCall(PetscFree(merge));
4692   PetscFunctionReturn(PETSC_SUCCESS);
4693 }
4694 
4695 #include <../src/mat/utils/freespace.h>
4696 #include <petscbt.h>
4697 
PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
{
  /* Numeric phase of MatCreateMPIAIJSumSeqAIJ: ship the values of off-process rows of each
     rank's seqmat to their owning ranks and sum all contributions into mpimat, using the
     communication pattern and merged ij structure saved by MatCreateMPIAIJSumSeqAIJSymbolic(). */
  MPI_Comm             comm;
  Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
  PetscMPIInt          size, rank, taga, *len_s;
  PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj;
  PetscInt             proc, m;
  PetscInt           **buf_ri, **buf_rj;
  PetscInt             k, anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
  PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
  MPI_Request         *s_waits, *r_waits;
  MPI_Status          *status;
  const MatScalar     *aa, *a_a;
  MatScalar          **abuf_r, *ba_i;
  Mat_Merge_SeqsToMPI *merge;
  PetscContainer       container;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
  PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* retrieve the merge context stashed on mpimat by the symbolic phase */
  PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
  PetscCall(PetscContainerGetPointer(container, (void **)&merge));
  PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
  aa = a_a;

  bi     = merge->bi;     /* row offsets of the merged local matrix */
  bj     = merge->bj;     /* column indices of the merged local matrix */
  buf_ri = merge->buf_ri; /* received i-structures, one per incoming message */
  buf_rj = merge->buf_rj; /* received j-structures, one per incoming message */

  PetscCall(PetscMalloc1(size, &status));
  owners = merge->rowmap->range;
  len_s  = merge->len_s;

  /* send and recv matrix values */
  PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
  PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));

  PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners[proc]; /* first row owned by proc; its values start at aa + ai[i] */
    PetscCallMPI(MPI_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
    k++;
  }

  if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
  PetscCall(PetscFree(status));

  PetscCall(PetscFree(s_waits));
  PetscCall(PetscFree(r_waits));

  /* insert mat values of mpimat */
  PetscCall(PetscMalloc1(N, &ba_i)); /* scratch row of values; a row has at most N entries */
  PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));

  for (k = 0; k < merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  /* set values of ba */
  m = merge->rowmap->n;
  for (i = 0; i < m; i++) {
    arow = owners[rank] + i; /* global row index of local row i */
    bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
    bnzi = bi[i + 1] - bi[i];
    PetscCall(PetscArrayzero(ba_i, bnzi));

    /* add local non-zero vals of this proc's seqmat into ba */
    anzi   = ai[arow + 1] - ai[arow];
    aj     = a->j + ai[arow];
    aa     = a_a + ai[arow];
    nextaj = 0;
    /* bj_i is a sorted superset of the sorted aj, so one forward scan suffices */
    for (j = 0; nextaj < anzi; j++) {
      if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
        ba_i[j] += aa[nextaj++];
      }
    }

    /* add received vals into ba */
    for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        anzi   = *(nextai[k] + 1) - *nextai[k];
        aj     = buf_rj[k] + *(nextai[k]);
        aa     = abuf_r[k] + *(nextai[k]);
        nextaj = 0;
        for (j = 0; nextaj < anzi; j++) {
          if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
            ba_i[j] += aa[nextaj++];
          }
        }
        nextrow[k]++;
        nextai[k]++;
      }
    }
    PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
  }
  PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
  PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));

  /* abuf_r[0] holds the actual received values, abuf_r the per-message pointer array */
  PetscCall(PetscFree(abuf_r[0]));
  PetscCall(PetscFree(abuf_r));
  PetscCall(PetscFree(ba_i));
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
  PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4816 
4817 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4818 {
4819   Mat                  B_mpi;
4820   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4821   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4822   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4823   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4824   PetscInt             len, proc, *dnz, *onz, bs, cbs;
4825   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4826   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4827   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4828   MPI_Status          *status;
4829   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4830   PetscBT              lnkbt;
4831   Mat_Merge_SeqsToMPI *merge;
4832   PetscContainer       container;
4833 
4834   PetscFunctionBegin;
4835   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4836 
4837   /* make sure it is a PETSc comm */
4838   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4839   PetscCallMPI(MPI_Comm_size(comm, &size));
4840   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4841 
4842   PetscCall(PetscNew(&merge));
4843   PetscCall(PetscMalloc1(size, &status));
4844 
4845   /* determine row ownership */
4846   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4847   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4848   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4849   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4850   PetscCall(PetscLayoutSetUp(merge->rowmap));
4851   PetscCall(PetscMalloc1(size, &len_si));
4852   PetscCall(PetscMalloc1(size, &merge->len_s));
4853 
4854   m      = merge->rowmap->n;
4855   owners = merge->rowmap->range;
4856 
4857   /* determine the number of messages to send, their lengths */
4858   len_s = merge->len_s;
4859 
4860   len          = 0; /* length of buf_si[] */
4861   merge->nsend = 0;
4862   for (proc = 0; proc < size; proc++) {
4863     len_si[proc] = 0;
4864     if (proc == rank) {
4865       len_s[proc] = 0;
4866     } else {
4867       len_si[proc] = owners[proc + 1] - owners[proc] + 1;
4868       len_s[proc]  = ai[owners[proc + 1]] - ai[owners[proc]]; /* num of rows to be sent to [proc] */
4869     }
4870     if (len_s[proc]) {
4871       merge->nsend++;
4872       nrows = 0;
4873       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4874         if (ai[i + 1] > ai[i]) nrows++;
4875       }
4876       len_si[proc] = 2 * (nrows + 1);
4877       len += len_si[proc];
4878     }
4879   }
4880 
4881   /* determine the number and length of messages to receive for ij-structure */
4882   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4883   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4884 
4885   /* post the Irecv of j-structure */
4886   PetscCall(PetscCommGetNewTag(comm, &tagj));
4887   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4888 
4889   /* post the Isend of j-structure */
4890   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4891 
4892   for (proc = 0, k = 0; proc < size; proc++) {
4893     if (!len_s[proc]) continue;
4894     i = owners[proc];
4895     PetscCallMPI(MPI_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4896     k++;
4897   }
4898 
4899   /* receives and sends of j-structure are complete */
4900   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4901   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4902 
4903   /* send and recv i-structure */
4904   PetscCall(PetscCommGetNewTag(comm, &tagi));
4905   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4906 
4907   PetscCall(PetscMalloc1(len + 1, &buf_s));
4908   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4909   for (proc = 0, k = 0; proc < size; proc++) {
4910     if (!len_s[proc]) continue;
4911     /* form outgoing message for i-structure:
4912          buf_si[0]:                 nrows to be sent
4913                [1:nrows]:           row index (global)
4914                [nrows+1:2*nrows+1]: i-structure index
4915     */
4916     nrows       = len_si[proc] / 2 - 1;
4917     buf_si_i    = buf_si + nrows + 1;
4918     buf_si[0]   = nrows;
4919     buf_si_i[0] = 0;
4920     nrows       = 0;
4921     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4922       anzi = ai[i + 1] - ai[i];
4923       if (anzi) {
4924         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4925         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4926         nrows++;
4927       }
4928     }
4929     PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4930     k++;
4931     buf_si += len_si[proc];
4932   }
4933 
4934   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4935   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4936 
4937   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4938   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4939 
4940   PetscCall(PetscFree(len_si));
4941   PetscCall(PetscFree(len_ri));
4942   PetscCall(PetscFree(rj_waits));
4943   PetscCall(PetscFree2(si_waits, sj_waits));
4944   PetscCall(PetscFree(ri_waits));
4945   PetscCall(PetscFree(buf_s));
4946   PetscCall(PetscFree(status));
4947 
4948   /* compute a local seq matrix in each processor */
4949   /* allocate bi array and free space for accumulating nonzero column info */
4950   PetscCall(PetscMalloc1(m + 1, &bi));
4951   bi[0] = 0;
4952 
4953   /* create and initialize a linked list */
4954   nlnk = N + 1;
4955   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4956 
4957   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4958   len = ai[owners[rank + 1]] - ai[owners[rank]];
4959   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4960 
4961   current_space = free_space;
4962 
4963   /* determine symbolic info for each local row */
4964   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4965 
4966   for (k = 0; k < merge->nrecv; k++) {
4967     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4968     nrows       = *buf_ri_k[k];
4969     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4970     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4971   }
4972 
4973   MatPreallocateBegin(comm, m, n, dnz, onz);
4974   len = 0;
4975   for (i = 0; i < m; i++) {
4976     bnzi = 0;
4977     /* add local non-zero cols of this proc's seqmat into lnk */
4978     arow = owners[rank] + i;
4979     anzi = ai[arow + 1] - ai[arow];
4980     aj   = a->j + ai[arow];
4981     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4982     bnzi += nlnk;
4983     /* add received col data into lnk */
4984     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4985       if (i == *nextrow[k]) {            /* i-th row */
4986         anzi = *(nextai[k] + 1) - *nextai[k];
4987         aj   = buf_rj[k] + *nextai[k];
4988         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4989         bnzi += nlnk;
4990         nextrow[k]++;
4991         nextai[k]++;
4992       }
4993     }
4994     if (len < bnzi) len = bnzi; /* =max(bnzi) */
4995 
4996     /* if free space is not available, make more free space */
4997     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
4998     /* copy data into free space, then initialize lnk */
4999     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
5000     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
5001 
5002     current_space->array += bnzi;
5003     current_space->local_used += bnzi;
5004     current_space->local_remaining -= bnzi;
5005 
5006     bi[i + 1] = bi[i] + bnzi;
5007   }
5008 
5009   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
5010 
5011   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
5012   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
5013   PetscCall(PetscLLDestroy(lnk, lnkbt));
5014 
5015   /* create symbolic parallel matrix B_mpi */
5016   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
5017   PetscCall(MatCreate(comm, &B_mpi));
5018   if (n == PETSC_DECIDE) {
5019     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
5020   } else {
5021     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
5022   }
5023   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
5024   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
5025   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
5026   MatPreallocateEnd(dnz, onz);
5027   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
5028 
5029   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
5030   B_mpi->assembled = PETSC_FALSE;
5031   merge->bi        = bi;
5032   merge->bj        = bj;
5033   merge->buf_ri    = buf_ri;
5034   merge->buf_rj    = buf_rj;
5035   merge->coi       = NULL;
5036   merge->coj       = NULL;
5037   merge->owners_co = NULL;
5038 
5039   PetscCall(PetscCommDestroy(&comm));
5040 
5041   /* attach the supporting struct to B_mpi for reuse */
5042   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5043   PetscCall(PetscContainerSetPointer(container, merge));
5044   PetscCall(PetscContainerSetUserDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5045   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5046   PetscCall(PetscContainerDestroy(&container));
5047   *mpimat = B_mpi;
5048 
5049   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5050   PetscFunctionReturn(PETSC_SUCCESS);
5051 }
5052 
5053 /*@C
5054   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5055   matrices from each processor
5056 
5057   Collective
5058 
5059   Input Parameters:
5060 + comm   - the communicators the parallel matrix will live on
5061 . seqmat - the input sequential matrices
5062 . m      - number of local rows (or `PETSC_DECIDE`)
5063 . n      - number of local columns (or `PETSC_DECIDE`)
5064 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5065 
5066   Output Parameter:
5067 . mpimat - the parallel matrix generated
5068 
5069   Level: advanced
5070 
  Note:
  The dimensions of the sequential matrix on each process MUST be the same.
  The input `seqmat` is stored in the container "Mat_Merge_SeqsToMPI" and is
  destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5075 
5076 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5077 @*/
5078 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5079 {
5080   PetscMPIInt size;
5081 
5082   PetscFunctionBegin;
5083   PetscCallMPI(MPI_Comm_size(comm, &size));
5084   if (size == 1) {
5085     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5086     if (scall == MAT_INITIAL_MATRIX) {
5087       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5088     } else {
5089       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5090     }
5091     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5092     PetscFunctionReturn(PETSC_SUCCESS);
5093   }
5094   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5095   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5096   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5097   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5098   PetscFunctionReturn(PETSC_SUCCESS);
5099 }
5100 
5101 /*@
5102   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5103 
5104   Not Collective
5105 
5106   Input Parameter:
5107 . A - the matrix
5108 
5109   Output Parameter:
5110 . A_loc - the local sequential matrix generated
5111 
5112   Level: developer
5113 
5114   Notes:
5115   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5116   with `mlocal` rows and `n` columns. Where `mlocal` is obtained with `MatGetLocalSize()` and
5117   `n` is the global column count obtained with `MatGetSize()`
5118 
5119   In other words combines the two parts of a parallel `MATMPIAIJ` matrix on each process to a single matrix.
5120 
5121   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5122 
5123   Destroy the matrix with `MatDestroy()`
5124 
5125 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5126 @*/
5127 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5128 {
5129   PetscBool mpi;
5130 
5131   PetscFunctionBegin;
5132   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5133   if (mpi) {
5134     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5135   } else {
5136     *A_loc = A;
5137     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5138   }
5139   PetscFunctionReturn(PETSC_SUCCESS);
5140 }
5141 
5142 /*@
5143   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5144 
5145   Not Collective
5146 
5147   Input Parameters:
5148 + A     - the matrix
5149 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5150 
5151   Output Parameter:
5152 . A_loc - the local sequential matrix generated
5153 
5154   Level: developer
5155 
5156   Notes:
  The matrix is created by taking all `A`'s local rows and putting them into a sequential
  matrix with `mlocal` rows and `n` columns. `mlocal` is the row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5160 
5161   In other words combines the two parts of a parallel `MATMPIAIJ` matrix on each process to a single matrix.
5162 
5163   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5164   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5165   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5166   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5167 
5168 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5169 @*/
PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
{
  Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ        *mat, *a, *b;
  PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray; /* cmap: local off-diag column -> global column */
  const PetscScalar *aa, *ba, *aav, *bav;
  PetscScalar       *ca, *cam;
  PetscMPIInt        size;
  PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
  PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
  PetscBool          match;

  PetscFunctionBegin;
  /* Accept any type whose name begins with "mpiaij" (prefix compare, so device subclasses pass too) */
  PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
  PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  if (size == 1) {
    /* Uniprocessor: the diagonal block already holds the entire matrix */
    if (scall == MAT_INITIAL_MATRIX) {
      PetscCall(PetscObjectReference((PetscObject)mpimat->A));
      *A_loc = mpimat->A;
    } else if (scall == MAT_REUSE_MATRIX) {
      PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
    }
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
  /* CSR structure of the diagonal (a) and off-diagonal (b) local blocks */
  a  = (Mat_SeqAIJ *)(mpimat->A)->data;
  b  = (Mat_SeqAIJ *)(mpimat->B)->data;
  ai = a->i;
  aj = a->j;
  bi = b->i;
  bj = b->j;
  PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
  PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
  /* aa/ba are walking cursors over the value arrays; aav/bav keep the originals for the restore calls */
  aa = aav;
  ba = bav;
  if (scall == MAT_INITIAL_MATRIX) {
    /* Merged row i holds all diagonal plus all off-diagonal entries of local row i */
    PetscCall(PetscMalloc1(1 + am, &ci));
    ci[0] = 0;
    for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
    PetscCall(PetscMalloc1(1 + ci[am], &cj));
    PetscCall(PetscMalloc1(1 + ci[am], &ca));
    k = 0;
    for (i = 0; i < am; i++) {
      ncols_o = bi[i + 1] - bi[i];
      ncols_d = ai[i + 1] - ai[i];
      /* off-diagonal entries with global column < cstart (keeps the merged row sorted by global column) */
      for (jo = 0; jo < ncols_o; jo++) {
        col = cmap[*bj];
        if (col >= cstart) break;
        cj[k] = col;
        bj++;
        ca[k++] = *ba++;
      }
      /* diagonal portion of A: local columns shifted into global numbering */
      for (j = 0; j < ncols_d; j++) {
        cj[k]   = cstart + *aj++;
        ca[k++] = *aa++;
      }
      /* remaining off-diagonal entries (global column past the diagonal block) */
      for (j = jo; j < ncols_o; j++) {
        cj[k]   = cmap[*bj++];
        ca[k++] = *ba++;
      }
    }
    /* put together the new matrix */
    PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
    /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
    /* Since these are PETSc arrays, change flags to free them as necessary. */
    mat          = (Mat_SeqAIJ *)(*A_loc)->data;
    mat->free_a  = PETSC_TRUE;
    mat->free_ij = PETSC_TRUE;
    mat->nonew   = 0;
  } else if (scall == MAT_REUSE_MATRIX) {
    /* Pattern assumed unchanged: recopy values only, in the same off/diag/off interleaving as above */
    mat = (Mat_SeqAIJ *)(*A_loc)->data;
    ci  = mat->i;
    cj  = mat->j;
    PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
    for (i = 0; i < am; i++) {
      /* off-diagonal entries with global column < cstart */
      ncols_o = bi[i + 1] - bi[i];
      for (jo = 0; jo < ncols_o; jo++) {
        col = cmap[*bj];
        if (col >= cstart) break;
        *cam++ = *ba++;
        bj++;
      }
      /* diagonal portion of A */
      ncols_d = ai[i + 1] - ai[i];
      for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
      /* remaining off-diagonal entries */
      for (j = jo; j < ncols_o; j++) {
        *cam++ = *ba++;
        bj++;
      }
    }
    PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
  PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
  PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
  PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5274 
5275 /*@
5276   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5277   mlocal rows and n columns. Where n is the sum of the number of columns of the diagonal and off-diagonal part
5278 
5279   Not Collective
5280 
5281   Input Parameters:
5282 + A     - the matrix
5283 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5284 
5285   Output Parameters:
5286 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5287 - A_loc - the local sequential matrix generated
5288 
5289   Level: developer
5290 
5291   Note:
5292   This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returning matrix are those associated with the diagonal
5293   part, then those associated with the off-diagonal part (in its local ordering)
5294 
5295 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5296 @*/
5297 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5298 {
5299   Mat             Ao, Ad;
5300   const PetscInt *cmap;
5301   PetscMPIInt     size;
5302   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5303 
5304   PetscFunctionBegin;
5305   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5306   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5307   if (size == 1) {
5308     if (scall == MAT_INITIAL_MATRIX) {
5309       PetscCall(PetscObjectReference((PetscObject)Ad));
5310       *A_loc = Ad;
5311     } else if (scall == MAT_REUSE_MATRIX) {
5312       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5313     }
5314     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5315     PetscFunctionReturn(PETSC_SUCCESS);
5316   }
5317   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5318   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5319   if (f) {
5320     PetscCall((*f)(A, scall, glob, A_loc));
5321   } else {
5322     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5323     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5324     Mat_SeqAIJ        *c;
5325     PetscInt          *ai = a->i, *aj = a->j;
5326     PetscInt          *bi = b->i, *bj = b->j;
5327     PetscInt          *ci, *cj;
5328     const PetscScalar *aa, *ba;
5329     PetscScalar       *ca;
5330     PetscInt           i, j, am, dn, on;
5331 
5332     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5333     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5334     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5335     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5336     if (scall == MAT_INITIAL_MATRIX) {
5337       PetscInt k;
5338       PetscCall(PetscMalloc1(1 + am, &ci));
5339       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5340       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5341       ci[0] = 0;
5342       for (i = 0, k = 0; i < am; i++) {
5343         const PetscInt ncols_o = bi[i + 1] - bi[i];
5344         const PetscInt ncols_d = ai[i + 1] - ai[i];
5345         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5346         /* diagonal portion of A */
5347         for (j = 0; j < ncols_d; j++, k++) {
5348           cj[k] = *aj++;
5349           ca[k] = *aa++;
5350         }
5351         /* off-diagonal portion of A */
5352         for (j = 0; j < ncols_o; j++, k++) {
5353           cj[k] = dn + *bj++;
5354           ca[k] = *ba++;
5355         }
5356       }
5357       /* put together the new matrix */
5358       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5359       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5360       /* Since these are PETSc arrays, change flags to free them as necessary. */
5361       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5362       c->free_a  = PETSC_TRUE;
5363       c->free_ij = PETSC_TRUE;
5364       c->nonew   = 0;
5365       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5366     } else if (scall == MAT_REUSE_MATRIX) {
5367       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5368       for (i = 0; i < am; i++) {
5369         const PetscInt ncols_d = ai[i + 1] - ai[i];
5370         const PetscInt ncols_o = bi[i + 1] - bi[i];
5371         /* diagonal portion of A */
5372         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5373         /* off-diagonal portion of A */
5374         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5375       }
5376       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5377     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5378     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5379     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &aa));
5380     if (glob) {
5381       PetscInt cst, *gidx;
5382 
5383       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5384       PetscCall(PetscMalloc1(dn + on, &gidx));
5385       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5386       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5387       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5388     }
5389   }
5390   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5391   PetscFunctionReturn(PETSC_SUCCESS);
5392 }
5393 
5394 /*@C
5395   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from an `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5396 
5397   Not Collective
5398 
5399   Input Parameters:
5400 + A     - the matrix
5401 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5402 . row   - index set of rows to extract (or `NULL`)
5403 - col   - index set of columns to extract (or `NULL`)
5404 
5405   Output Parameter:
5406 . A_loc - the local sequential matrix generated
5407 
5408   Level: developer
5409 
5410 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5411 @*/
5412 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5413 {
5414   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5415   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5416   IS          isrowa, iscola;
5417   Mat        *aloc;
5418   PetscBool   match;
5419 
5420   PetscFunctionBegin;
5421   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5422   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5423   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5424   if (!row) {
5425     start = A->rmap->rstart;
5426     end   = A->rmap->rend;
5427     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5428   } else {
5429     isrowa = *row;
5430   }
5431   if (!col) {
5432     start = A->cmap->rstart;
5433     cmap  = a->garray;
5434     nzA   = a->A->cmap->n;
5435     nzB   = a->B->cmap->n;
5436     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5437     ncols = 0;
5438     for (i = 0; i < nzB; i++) {
5439       if (cmap[i] < start) idx[ncols++] = cmap[i];
5440       else break;
5441     }
5442     imark = i;
5443     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5444     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5445     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5446   } else {
5447     iscola = *col;
5448   }
5449   if (scall != MAT_INITIAL_MATRIX) {
5450     PetscCall(PetscMalloc1(1, &aloc));
5451     aloc[0] = *A_loc;
5452   }
5453   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5454   if (!col) { /* attach global id of condensed columns */
5455     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5456   }
5457   *A_loc = aloc[0];
5458   PetscCall(PetscFree(aloc));
5459   if (!row) PetscCall(ISDestroy(&isrowa));
5460   if (!col) PetscCall(ISDestroy(&iscola));
5461   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5462   PetscFunctionReturn(PETSC_SUCCESS);
5463 }
5464 
/*
 * Create a sequential AIJ matrix based on row indices: all columns of a row are extracted
 * once the row is matched. Rows may be local or remote. The routine is designed to be
 * memory scalable, so that nothing is sized by a global dimension.
 * */
static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
{
  Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)(p->A)->data, *po = (Mat_SeqAIJ *)(p->B)->data, *p_oth;
  PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
  PetscMPIInt            owner;
  PetscSFNode           *iremote, *oiremote;
  const PetscInt        *lrowindices;
  PetscSF                sf, osf;
  PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
  PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
  MPI_Comm               comm;
  ISLocalToGlobalMapping mapping;
  const PetscScalar     *pd_a, *po_a;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
  /* plocalsize is the number of roots (locally owned rows of P)
   * nrows is the number of leaves (requested rows, local or remote)
   * */
  PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
  PetscCall(ISGetLocalSize(rows, &nrows));
  PetscCall(PetscCalloc1(nrows, &iremote));
  PetscCall(ISGetIndices(rows, &lrowindices));
  for (i = 0; i < nrows; i++) {
    /* Find a remote index and an owner for a row
     * The row could be local or remote
     * */
    owner = 0;
    lidx  = 0;
    PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
    iremote[i].index = lidx;
    iremote[i].rank  = owner;
  }
  /* Create SF to communicate how many nonzero columns there are for each row */
  PetscCall(PetscSFCreate(comm, &sf));
  /* SF will figure out the number of nonzero columns for each row, and their
   * offsets
   * */
  PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));

  /* Per owned row: interleaved (diag,offdiag) nonzero counts and running offsets */
  PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
  PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
  PetscCall(PetscCalloc1(nrows, &pnnz));
  roffsets[0] = 0;
  roffsets[1] = 0;
  for (i = 0; i < plocalsize; i++) {
    /* diagonal */
    nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
    /* off-diagonal */
    nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
    /* compute offsets so that we know the relative location of each row within the owner's data */
    roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
    roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
  }
  PetscCall(PetscCalloc1(2 * nrows, &nlcols));
  PetscCall(PetscCalloc1(2 * nrows, &loffsets));
  /* 'r' means root, and 'l' means leaf */
  PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree(roffsets));
  PetscCall(PetscFree(nrcols));
  dntotalcols = 0;
  ontotalcols = 0;
  ncol        = 0;
  for (i = 0; i < nrows; i++) {
    pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
    ncol    = PetscMax(pnnz[i], ncol); /* upper bound on column count; see note below */
    /* diagonal */
    dntotalcols += nlcols[i * 2 + 0];
    /* off-diagonal */
    ontotalcols += nlcols[i * 2 + 1];
  }
  /* We do not need to figure out the exact number of columns
   * since all the calculations will be done by going through the raw data
   * */
  PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
  PetscCall(MatSetUp(*P_oth));
  PetscCall(PetscFree(pnnz));
  p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
  /* Second pair of SFs: one entry per nonzero, to pull column indices and values.
   * diagonal */
  PetscCall(PetscCalloc1(dntotalcols, &iremote));
  /* off-diagonal */
  PetscCall(PetscCalloc1(ontotalcols, &oiremote));
  /* diagonal */
  PetscCall(PetscCalloc1(dntotalcols, &ilocal));
  /* off-diagonal */
  PetscCall(PetscCalloc1(ontotalcols, &oilocal));
  dntotalcols = 0;
  ontotalcols = 0;
  ntotalcols  = 0;
  for (i = 0; i < nrows; i++) {
    owner = 0;
    PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
    /* Set iremote for diag matrix */
    for (j = 0; j < nlcols[i * 2 + 0]; j++) {
      iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
      iremote[dntotalcols].rank  = owner;
      /* P_oth is seqAIJ so ilocal needs to point to the first part of its (single) memory block */
      ilocal[dntotalcols++] = ntotalcols++;
    }
    /* off-diagonal */
    for (j = 0; j < nlcols[i * 2 + 1]; j++) {
      oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
      oiremote[ontotalcols].rank  = owner;
      oilocal[ontotalcols++]      = ntotalcols++;
    }
  }
  PetscCall(ISRestoreIndices(rows, &lrowindices));
  PetscCall(PetscFree(loffsets));
  PetscCall(PetscFree(nlcols));
  PetscCall(PetscSFCreate(comm, &sf));
  /* P serves as roots and P_oth as leaves
   * Diag matrix
   * */
  PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(sf));
  PetscCall(PetscSFSetUp(sf));

  PetscCall(PetscSFCreate(comm, &osf));
  /* off-diagonal */
  PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetFromOptions(osf));
  PetscCall(PetscSFSetUp(osf));
  PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
  PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
  /* operate on the matrix internal data (pd->j, po->j) in place to save memory */
  PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
  PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
  PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
  /* Convert to global indices for the diag matrix (undone after the broadcast completes) */
  for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
  PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* We want P_oth to store global column indices */
  PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
  /* Use memory scalable approach */
  PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
  PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* Convert back to local indices */
  for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
  PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
  nout = 0;
  /* Restore po->j to local numbering; every index must map back or the matrix was corrupted */
  PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
  PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal to nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
  PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
  /* Exchange values */
  PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
  PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
  PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
  /* Stop PETSc from shrinking memory */
  for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
  PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
  /* Attach PetscSF objects to P_oth so that we can reuse them later */
  PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
  PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscSFDestroy(&osf));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5638 
5639 /*
5640  * Creates a SeqAIJ matrix by taking rows of B that equal to nonzero columns of local A
5641  * This supports MPIAIJ and MAIJ
5642  * */
PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *p_oth;
  IS          rows, map;   /* rows: global block rows of P to fetch; map: off-diag col of A -> local row of *P_oth */
  PetscHMapI  hamp;        /* hash map used to detect unique block-row keys */
  PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
  MPI_Comm    comm;
  PetscSF     sf, osf;     /* star forests composed on *P_oth by MatCreateSeqSubMatrixWithRows_Private for value updates */
  PetscBool   has;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
  /* If it is the first time, create an index set of off-diag nonzero columns of A,
   *  and then create a submatrix (that often is an overlapping matrix)
   * */
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Use a hash table to figure out unique keys (key = garray[i]/dof maps a point column to its block row in P) */
    PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
    PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
    count = 0;
    /* This assumes a->garray[] is sorted; the "count - 1" shortcut below relies on repeated
       keys being adjacent. TODO(review): confirm garray is always sorted for MAIJ callers. */
    for (i = 0; i < a->B->cmap->n; i++) {
      key = a->garray[i] / dof;
      PetscCall(PetscHMapIHas(hamp, key, &has));
      if (!has) {
        mapping[i] = count;
        PetscCall(PetscHMapISet(hamp, key, count++));
      } else {
        /* Current 'i' has the same key as the previous step, so it maps to the last inserted entry */
        mapping[i] = count - 1;
      }
    }
    PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
    PetscCall(PetscHMapIGetSize(hamp, &htsize));
    PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
    /* Extract the unique keys, sort them, and use them as the global rows of P to pull over */
    PetscCall(PetscCalloc1(htsize, &rowindices));
    off = 0;
    PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
    PetscCall(PetscHMapIDestroy(&hamp));
    PetscCall(PetscSortInt(htsize, rowindices));
    PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
    /* In case, the matrix was already created but users want to recreate the matrix */
    PetscCall(MatDestroy(P_oth));
    PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
    PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
    PetscCall(ISDestroy(&map));
    PetscCall(ISDestroy(&rows));
  } else if (reuse == MAT_REUSE_MATRIX) {
    /* If matrix was already created, we simply update values using SF objects
     * that are attached to the matrix earlier (composed by MatCreateSeqSubMatrixWithRows_Private).
     */
    const PetscScalar *pd_a, *po_a;

    PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
    PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
    PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
    p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
    /* Update values in place: broadcast P's diagonal and off-diagonal values into p_oth->a.
       The two SFs target disjoint slots of p_oth->a, so the broadcasts may be interleaved. */
    PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
    PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
    PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
    PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
    PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
    PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
  } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
  PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5715 
5716 /*@C
5717   MatGetBrowsOfAcols - Returns `IS` that contain rows of `B` that equal to nonzero columns of local `A`
5718 
5719   Collective
5720 
5721   Input Parameters:
5722 + A     - the first matrix in `MATMPIAIJ` format
5723 . B     - the second matrix in `MATMPIAIJ` format
5724 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5725 
5726   Output Parameters:
5727 + rowb  - On input index sets of rows of B to extract (or `NULL`), modified on output
5728 . colb  - On input index sets of columns of B to extract (or `NULL`), modified on output
5729 - B_seq - the sequential matrix generated
5730 
5731   Level: developer
5732 
5733 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5734 @*/
5735 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5736 {
5737   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5738   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5739   IS          isrowb, iscolb;
5740   Mat        *bseq = NULL;
5741 
5742   PetscFunctionBegin;
5743   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5744              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5745   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5746 
5747   if (scall == MAT_INITIAL_MATRIX) {
5748     start = A->cmap->rstart;
5749     cmap  = a->garray;
5750     nzA   = a->A->cmap->n;
5751     nzB   = a->B->cmap->n;
5752     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5753     ncols = 0;
5754     for (i = 0; i < nzB; i++) { /* row < local row index */
5755       if (cmap[i] < start) idx[ncols++] = cmap[i];
5756       else break;
5757     }
5758     imark = i;
5759     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5760     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5761     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5762     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5763   } else {
5764     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5765     isrowb = *rowb;
5766     iscolb = *colb;
5767     PetscCall(PetscMalloc1(1, &bseq));
5768     bseq[0] = *B_seq;
5769   }
5770   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5771   *B_seq = bseq[0];
5772   PetscCall(PetscFree(bseq));
5773   if (!rowb) {
5774     PetscCall(ISDestroy(&isrowb));
5775   } else {
5776     *rowb = isrowb;
5777   }
5778   if (!colb) {
5779     PetscCall(ISDestroy(&iscolb));
5780   } else {
5781     *colb = iscolb;
5782   }
5783   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5784   PetscFunctionReturn(PETSC_SUCCESS);
5785 }
5786 
5787 /*
5788     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking rows of B that equal to nonzero columns
5789     of the OFF-DIAGONAL portion of local A
5790 
5791     Collective
5792 
5793    Input Parameters:
5794 +    A,B - the matrices in `MATMPIAIJ` format
5795 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5796 
   Output Parameters:
5798 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5799 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5800 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5801 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5802 
5803     Developer Note:
5804     This directly accesses information inside the VecScatter associated with the matrix-vector product
5805      for this matrix. This is not desirable..
5806 
5807     Level: developer
5808 
5809 */
5810 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5811 {
5812   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5813   Mat_SeqAIJ        *b_oth;
5814   VecScatter         ctx;
5815   MPI_Comm           comm;
5816   const PetscMPIInt *rprocs, *sprocs;
5817   const PetscInt    *srow, *rstarts, *sstarts;
5818   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5819   PetscInt           i, j, k = 0, l, ll, nrecvs, nsends, nrows, *rstartsj = NULL, *sstartsj, len;
5820   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5821   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5822   PetscMPIInt        size, tag, rank, nreqs;
5823 
5824   PetscFunctionBegin;
5825   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5826   PetscCallMPI(MPI_Comm_size(comm, &size));
5827 
5828   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5829              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5830   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5831   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5832 
5833   if (size == 1) {
5834     startsj_s = NULL;
5835     bufa_ptr  = NULL;
5836     *B_oth    = NULL;
5837     PetscFunctionReturn(PETSC_SUCCESS);
5838   }
5839 
5840   ctx = a->Mvctx;
5841   tag = ((PetscObject)ctx)->tag;
5842 
5843   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5844   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5845   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5846   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5847   PetscCall(PetscMalloc1(nreqs, &reqs));
5848   rwaits = reqs;
5849   swaits = reqs + nrecvs;
5850 
5851   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5852   if (scall == MAT_INITIAL_MATRIX) {
5853     /* i-array */
5854     /*  post receives */
5855     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5856     for (i = 0; i < nrecvs; i++) {
5857       rowlen = rvalues + rstarts[i] * rbs;
5858       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5859       PetscCallMPI(MPI_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5860     }
5861 
5862     /* pack the outgoing message */
5863     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5864 
5865     sstartsj[0] = 0;
5866     rstartsj[0] = 0;
5867     len         = 0; /* total length of j or a array to be sent */
5868     if (nsends) {
5869       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5870       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5871     }
5872     for (i = 0; i < nsends; i++) {
5873       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5874       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5875       for (j = 0; j < nrows; j++) {
5876         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5877         for (l = 0; l < sbs; l++) {
5878           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5879 
5880           rowlen[j * sbs + l] = ncols;
5881 
5882           len += ncols;
5883           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5884         }
5885         k++;
5886       }
5887       PetscCallMPI(MPI_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5888 
5889       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5890     }
5891     /* recvs and sends of i-array are completed */
5892     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5893     PetscCall(PetscFree(svalues));
5894 
5895     /* allocate buffers for sending j and a arrays */
5896     PetscCall(PetscMalloc1(len + 1, &bufj));
5897     PetscCall(PetscMalloc1(len + 1, &bufa));
5898 
5899     /* create i-array of B_oth */
5900     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5901 
5902     b_othi[0] = 0;
5903     len       = 0; /* total length of j or a array to be received */
5904     k         = 0;
5905     for (i = 0; i < nrecvs; i++) {
5906       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5907       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5908       for (j = 0; j < nrows; j++) {
5909         b_othi[k + 1] = b_othi[k] + rowlen[j];
5910         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5911         k++;
5912       }
5913       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5914     }
5915     PetscCall(PetscFree(rvalues));
5916 
5917     /* allocate space for j and a arrays of B_oth */
5918     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5919     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5920 
5921     /* j-array */
5922     /*  post receives of j-array */
5923     for (i = 0; i < nrecvs; i++) {
5924       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5925       PetscCallMPI(MPI_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5926     }
5927 
5928     /* pack the outgoing message j-array */
5929     if (nsends) k = sstarts[0];
5930     for (i = 0; i < nsends; i++) {
5931       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5932       bufJ  = bufj + sstartsj[i];
5933       for (j = 0; j < nrows; j++) {
5934         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5935         for (ll = 0; ll < sbs; ll++) {
5936           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5937           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5938           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5939         }
5940       }
5941       PetscCallMPI(MPI_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5942     }
5943 
5944     /* recvs and sends of j-array are completed */
5945     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5946   } else if (scall == MAT_REUSE_MATRIX) {
5947     sstartsj = *startsj_s;
5948     rstartsj = *startsj_r;
5949     bufa     = *bufa_ptr;
5950     b_oth    = (Mat_SeqAIJ *)(*B_oth)->data;
5951     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5952   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
5953 
5954   /* a-array */
5955   /*  post receives of a-array */
5956   for (i = 0; i < nrecvs; i++) {
5957     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5958     PetscCallMPI(MPI_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5959   }
5960 
5961   /* pack the outgoing message a-array */
5962   if (nsends) k = sstarts[0];
5963   for (i = 0; i < nsends; i++) {
5964     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5965     bufA  = bufa + sstartsj[i];
5966     for (j = 0; j < nrows; j++) {
5967       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5968       for (ll = 0; ll < sbs; ll++) {
5969         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5970         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5971         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5972       }
5973     }
5974     PetscCallMPI(MPI_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5975   }
5976   /* recvs and sends of a-array are completed */
5977   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5978   PetscCall(PetscFree(reqs));
5979 
5980   if (scall == MAT_INITIAL_MATRIX) {
5981     /* put together the new matrix */
5982     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
5983 
5984     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5985     /* Since these are PETSc arrays, change flags to free them as necessary. */
5986     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
5987     b_oth->free_a  = PETSC_TRUE;
5988     b_oth->free_ij = PETSC_TRUE;
5989     b_oth->nonew   = 0;
5990 
5991     PetscCall(PetscFree(bufj));
5992     if (!startsj_s || !bufa_ptr) {
5993       PetscCall(PetscFree2(sstartsj, rstartsj));
5994       PetscCall(PetscFree(bufa_ptr));
5995     } else {
5996       *startsj_s = sstartsj;
5997       *startsj_r = rstartsj;
5998       *bufa_ptr  = bufa;
5999     }
6000   } else if (scall == MAT_REUSE_MATRIX) {
6001     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
6002   }
6003 
6004   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
6005   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
6006   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
6007   PetscFunctionReturn(PETSC_SUCCESS);
6008 }
6009 
6010 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
6011 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
6012 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
6013 #if defined(PETSC_HAVE_MKL_SPARSE)
6014 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
6015 #endif
6016 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
6017 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
6018 #if defined(PETSC_HAVE_ELEMENTAL)
6019 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
6020 #endif
6021 #if defined(PETSC_HAVE_SCALAPACK)
6022 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
6023 #endif
6024 #if defined(PETSC_HAVE_HYPRE)
6025 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
6026 #endif
6027 #if defined(PETSC_HAVE_CUDA)
6028 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
6029 #endif
6030 #if defined(PETSC_HAVE_HIP)
6031 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6032 #endif
6033 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6034 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6035 #endif
6036 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6037 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6038 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6039 
6040 /*
6041     Computes (B'*A')' since computing B*A directly is untenable
6042 
6043                n                       p                          p
6044         [             ]       [             ]         [                 ]
6045       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6046         [             ]       [             ]         [                 ]
6047 
6048 */
6049 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6050 {
6051   Mat At, Bt, Ct;
6052 
6053   PetscFunctionBegin;
6054   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6055   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6056   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &Ct));
6057   PetscCall(MatDestroy(&At));
6058   PetscCall(MatDestroy(&Bt));
6059   PetscCall(MatTransposeSetPrecursor(Ct, C));
6060   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6061   PetscCall(MatDestroy(&Ct));
6062   PetscFunctionReturn(PETSC_SUCCESS);
6063 }
6064 
6065 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6066 {
6067   PetscBool cisdense;
6068 
6069   PetscFunctionBegin;
6070   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6071   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6072   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6073   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6074   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6075   PetscCall(MatSetUp(C));
6076 
6077   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6078   PetscFunctionReturn(PETSC_SUCCESS);
6079 }
6080 
6081 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6082 {
6083   Mat_Product *product = C->product;
6084   Mat          A = product->A, B = product->B;
6085 
6086   PetscFunctionBegin;
6087   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6088              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6089   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6090   C->ops->productsymbolic = MatProductSymbolic_AB;
6091   PetscFunctionReturn(PETSC_SUCCESS);
6092 }
6093 
6094 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6095 {
6096   Mat_Product *product = C->product;
6097 
6098   PetscFunctionBegin;
6099   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6100   PetscFunctionReturn(PETSC_SUCCESS);
6101 }
6102 
6103 /*
6104    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6105 
6106   Input Parameters:
6107 
6108     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6109     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6110 
6111     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6112 
6113     For Set1, j1[] contains column indices of the nonzeros.
6114     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
    respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6116     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6117 
6118     Similar for Set2.
6119 
6120     This routine merges the two sets of nonzeros row by row and removes repeats.
6121 
6122   Output Parameters: (memory is allocated by the caller)
6123 
6124     i[],j[]: the CSR of the merged matrix, which has m rows.
6125     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6126     imap2[]: similar to imap1[], but for Set2.
6127     Note we order nonzeros row-by-row and from left to right.
6128 */
static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
{
  PetscInt   r, m; /* Row index of mat */
  PetscCount t, t1, t2, b1, e1, b2, e2;

  PetscFunctionBegin;
  PetscCall(MatGetLocalSize(mat, &m, NULL));
  t1 = t2 = t = 0; /* Running counts of unique nonzeros in Set1, Set2 and the merged matrix, respectively */
  i[0]        = 0;
  for (r = 0; r < m; r++) { /* Do row by row merging */
    b1 = rowBegin1[r];
    e1 = rowEnd1[r];
    b2 = rowBegin2[r];
    e2 = rowEnd2[r];
    /* Classic two-pointer merge; duplicates within a set are skipped by jumping
       jmap[t+1]-jmap[t] entries at a time (the repeat count of the current unique nonzero) */
    while (b1 < e1 && b2 < e2) {
      if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
        j[t]      = j1[b1];
        imap1[t1] = t;
        imap2[t2] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to next unique local nonzero */
        b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to next unique remote nonzero */
        t1++;
        t2++;
        t++;
      } else if (j1[b1] < j2[b2]) {
        j[t]      = j1[b1];
        imap1[t1] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1];
        t1++;
        t++;
      } else {
        j[t]      = j2[b2];
        imap2[t2] = t;
        b2 += jmap2[t2 + 1] - jmap2[t2];
        t2++;
        t++;
      }
    }
    /* Merge the remaining in either j1[] or j2[] (at most one of these loops runs) */
    while (b1 < e1) {
      j[t]      = j1[b1];
      imap1[t1] = t;
      b1 += jmap1[t1 + 1] - jmap1[t1];
      t1++;
      t++;
    }
    while (b2 < e2) {
      j[t]      = j2[b2];
      imap2[t2] = t;
      b2 += jmap2[t2 + 1] - jmap2[t2];
      t2++;
      t++;
    }
    i[r + 1] = t; /* CSR row pointer of the merged matrix: row r holds [i[r], i[r+1]) */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6186 
6187 /*
6188   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6189 
6190   Input Parameters:
6191     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6192     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6193       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6194 
6195       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6196       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6197 
6198   Output Parameters:
6199     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6200     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6201       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6202       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6203 
6204     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6205       Atot: number of entries belonging to the diagonal block.
6206       Annz: number of unique nonzeros belonging to the diagonal block.
6207       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6208         repeats (i.e., same 'i,j' pair).
6209       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6210         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6211 
6212       Atot: number of entries belonging to the diagonal block
6213       Annz: number of unique nonzeros belonging to the diagonal block.
6214 
6215     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6216 
6217     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6218 */
static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
{
  PetscInt    cstart, cend, rstart, rend, row, col;
  PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
  PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
  PetscCount  k, m, p, q, r, s, mid;
  PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;

  PetscFunctionBegin;
  PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
  PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
  m = rend - rstart;

  /* Skip negative rows */
  for (k = 0; k < n; k++)
    if (i[k] >= 0) break;

  /* Process [k,n): sort and partition each local row into diag and offdiag portions,
     fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
  */
  while (k < n) {
    row = i[k];
    /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
    for (s = k; s < n; s++)
      if (i[s] != row) break;

    /* Shift diag columns to range of [-PETSC_MAX_INT, -1]; the shift is reverted below once
       the diag/offdiag boundary 'mid' has been located */
    for (p = k; p < s; p++) {
      if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_MAX_INT;
      else PetscAssert((j[p] >= 0) && (j[p] <= mat->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is out of range", j[p]);
    }
    /* Sort the row; shifted (diagonal) columns are all negative, so they sort before the
       off-diagonal ones. perm[] is permuted along with j[]. */
    PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
    PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
    rowBegin[row - rstart] = k;
    rowMid[row - rstart]   = mid;
    rowEnd[row - rstart]   = s;

    /* Count nonzeros of this diag/offdiag row, which might have repeats */
    Atot += mid - k;
    Btot += s - mid;

    /* Count unique nonzeros of this diag row */
    for (p = k; p < mid;) {
      col = j[p];
      do {
        j[p] += PETSC_MAX_INT; /* Revert the modified diagonal indices */
        p++;
      } while (p < mid && j[p] == col);
      Annz++;
    }

    /* Count unique nonzeros of this offdiag row (no index shift to revert here) */
    for (p = mid; p < s;) {
      col = j[p];
      do {
        p++;
      } while (p < s && j[p] == col);
      Bnnz++;
    }
    k = s; /* Advance to the first entry of the next row */
  }

  /* Allocation according to Atot, Btot, Annz, Bnnz */
  PetscCall(PetscMalloc1(Atot, &Aperm));
  PetscCall(PetscMalloc1(Btot, &Bperm));
  PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
  PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));

  /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
  Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0; /* reuse the counters as write cursors */
  for (r = 0; r < m; r++) {
    k   = rowBegin[r];
    mid = rowMid[r];
    s   = rowEnd[r];
    PetscCall(PetscArraycpy(Aperm + Atot, perm + k, mid - k));
    PetscCall(PetscArraycpy(Bperm + Btot, perm + mid, s - mid));
    Atot += mid - k;
    Btot += s - mid;

    /* Scan column indices in this row and find out how many repeats each unique nonzero has */
    for (p = k; p < mid;) {
      col = j[p];
      q   = p;
      do {
        p++;
      } while (p < mid && j[p] == col);
      Ajmap[Annz + 1] = Ajmap[Annz] + (p - q); /* prefix sum of repeat counts */
      Annz++;
    }

    for (p = mid; p < s;) {
      col = j[p];
      q   = p;
      do {
        p++;
      } while (p < s && j[p] == col);
      Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
      Bnnz++;
    }
  }
  /* Output */
  *Aperm_ = Aperm;
  *Annz_  = Annz;
  *Atot_  = Atot;
  *Ajmap_ = Ajmap;
  *Bperm_ = Bperm;
  *Bnnz_  = Bnnz;
  *Btot_  = Btot;
  *Bjmap_ = Bjmap;
  PetscFunctionReturn(PETSC_SUCCESS);
}
6330 
6331 /*
6332   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6333 
6334   Input Parameters:
6335     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6336     nnz:  number of unique nonzeros in the merged matrix
6337     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6338     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6339 
6340   Output Parameter: (memory is allocated by the caller)
6341     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6342 
6343   Example:
6344     nnz1 = 4
6345     nnz  = 6
6346     imap = [1,3,4,5]
6347     jmap = [0,3,5,6,7]
6348    then,
6349     jmap_new = [0,0,3,3,5,6,7]
6350 */
6351 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6352 {
6353   PetscCount k, p;
6354 
6355   PetscFunctionBegin;
6356   jmap_new[0] = 0;
6357   p           = nnz;                /* p loops over jmap_new[] backwards */
6358   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6359     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6360   }
6361   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6362   PetscFunctionReturn(PETSC_SUCCESS);
6363 }
6364 
6365 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void *data)
6366 {
6367   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;
6368 
6369   PetscFunctionBegin;
6370   PetscCall(PetscSFDestroy(&coo->sf));
6371   PetscCall(PetscFree(coo->Aperm1));
6372   PetscCall(PetscFree(coo->Bperm1));
6373   PetscCall(PetscFree(coo->Ajmap1));
6374   PetscCall(PetscFree(coo->Bjmap1));
6375   PetscCall(PetscFree(coo->Aimap2));
6376   PetscCall(PetscFree(coo->Bimap2));
6377   PetscCall(PetscFree(coo->Aperm2));
6378   PetscCall(PetscFree(coo->Bperm2));
6379   PetscCall(PetscFree(coo->Ajmap2));
6380   PetscCall(PetscFree(coo->Bjmap2));
6381   PetscCall(PetscFree(coo->Cperm1));
6382   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6383   PetscCall(PetscFree(coo));
6384   PetscFunctionReturn(PETSC_SUCCESS);
6385 }
6386 
6387 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6388 {
6389   MPI_Comm             comm;
6390   PetscMPIInt          rank, size;
6391   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6392   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6393   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6394   PetscContainer       container;
6395   MatCOOStruct_MPIAIJ *coo;
6396 
6397   PetscFunctionBegin;
6398   PetscCall(PetscFree(mpiaij->garray));
6399   PetscCall(VecDestroy(&mpiaij->lvec));
6400 #if defined(PETSC_USE_CTABLE)
6401   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6402 #else
6403   PetscCall(PetscFree(mpiaij->colmap));
6404 #endif
6405   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6406   mat->assembled     = PETSC_FALSE;
6407   mat->was_assembled = PETSC_FALSE;
6408 
6409   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6410   PetscCallMPI(MPI_Comm_size(comm, &size));
6411   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6412   PetscCall(PetscLayoutSetUp(mat->rmap));
6413   PetscCall(PetscLayoutSetUp(mat->cmap));
6414   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6415   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6416   PetscCall(MatGetLocalSize(mat, &m, &n));
6417   PetscCall(MatGetSize(mat, &M, &N));
6418 
6419   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6420   /* entries come first, then local rows, then remote rows.                     */
6421   PetscCount n1 = coo_n, *perm1;
6422   PetscInt  *i1 = coo_i, *j1 = coo_j;
6423 
6424   PetscCall(PetscMalloc1(n1, &perm1));
6425   for (k = 0; k < n1; k++) perm1[k] = k;
6426 
6427   /* Manipulate indices so that entries with negative row or col indices will have smallest
6428      row indices, local entries will have greater but negative row indices, and remote entries
6429      will have positive row indices.
6430   */
6431   for (k = 0; k < n1; k++) {
6432     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_MIN_INT;                /* e.g., -2^31, minimal to move them ahead */
6433     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_MAX_INT; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_MAX_INT, -1] */
6434     else {
6435       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but insert to remote rows");
6436       if (mpiaij->donotstash) i1[k] = PETSC_MIN_INT; /* Ignore offproc entries as if they had negative indices */
6437     }
6438   }
6439 
6440   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6441   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6442 
6443   /* Advance k to the first entry we need to take care of */
6444   for (k = 0; k < n1; k++)
6445     if (i1[k] > PETSC_MIN_INT) break;
6446   PetscInt i1start = k;
6447 
6448   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_MAX_INT, &rem)); /* rem is upper bound of the last local row */
6449   for (; k < rem; k++) i1[k] += PETSC_MAX_INT;                                    /* Revert row indices of local rows*/
6450 
6451   /*           Send remote rows to their owner                                  */
6452   /* Find which rows should be sent to which remote ranks*/
6453   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6454   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6455   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6456   const PetscInt *ranges;
6457   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6458 
6459   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6460   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6461   for (k = rem; k < n1;) {
6462     PetscMPIInt owner;
6463     PetscInt    firstRow, lastRow;
6464 
6465     /* Locate a row range */
6466     firstRow = i1[k]; /* first row of this owner */
6467     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6468     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6469 
6470     /* Find the first index 'p' in [k,n) with i[p] belonging to next owner */
6471     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6472 
6473     /* All entries in [k,p) belong to this remote owner */
6474     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6475       PetscMPIInt *sendto2;
6476       PetscInt    *nentries2;
6477       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6478 
6479       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6480       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6481       PetscCall(PetscArraycpy(nentries2, nentries2, maxNsend + 1));
6482       PetscCall(PetscFree2(sendto, nentries2));
6483       sendto   = sendto2;
6484       nentries = nentries2;
6485       maxNsend = maxNsend2;
6486     }
6487     sendto[nsend]   = owner;
6488     nentries[nsend] = p - k;
6489     PetscCall(PetscCountCast(p - k, &nentries[nsend]));
6490     nsend++;
6491     k = p;
6492   }
6493 
6494   /* Build 1st SF to know offsets on remote to send data */
6495   PetscSF      sf1;
6496   PetscInt     nroots = 1, nroots2 = 0;
6497   PetscInt     nleaves = nsend, nleaves2 = 0;
6498   PetscInt    *offsets;
6499   PetscSFNode *iremote;
6500 
6501   PetscCall(PetscSFCreate(comm, &sf1));
6502   PetscCall(PetscMalloc1(nsend, &iremote));
6503   PetscCall(PetscMalloc1(nsend, &offsets));
6504   for (k = 0; k < nsend; k++) {
6505     iremote[k].rank  = sendto[k];
6506     iremote[k].index = 0;
6507     nleaves2 += nentries[k];
6508     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6509   }
6510   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6511   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6512   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Would nroots2 overflow, we check offsets[] below */
6513   PetscCall(PetscSFDestroy(&sf1));
6514   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT "", nleaves2, n1 - rem);
6515 
6516   /* Build 2nd SF to send remote COOs to their owner */
6517   PetscSF sf2;
6518   nroots  = nroots2;
6519   nleaves = nleaves2;
6520   PetscCall(PetscSFCreate(comm, &sf2));
6521   PetscCall(PetscSFSetFromOptions(sf2));
6522   PetscCall(PetscMalloc1(nleaves, &iremote));
6523   p = 0;
6524   for (k = 0; k < nsend; k++) {
6525     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6526     for (q = 0; q < nentries[k]; q++, p++) {
6527       iremote[p].rank  = sendto[k];
6528       iremote[p].index = offsets[k] + q;
6529     }
6530   }
6531   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6532 
6533   /* Send the remote COOs to their owner */
6534   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6535   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6536   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6537   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1 + rem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6538   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1 + rem, i2, MPI_REPLACE));
6539   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1 + rem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6540   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1 + rem, j2, MPI_REPLACE));
6541 
6542   PetscCall(PetscFree(offsets));
6543   PetscCall(PetscFree2(sendto, nentries));
6544 
6545   /* Sort received COOs by row along with the permutation array     */
6546   for (k = 0; k < n2; k++) perm2[k] = k;
6547   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6548 
6549   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6550   PetscCount *Cperm1;
6551   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6552   PetscCall(PetscArraycpy(Cperm1, perm1 + rem, nleaves));
6553 
6554   /* Support for HYPRE matrices, kind of a hack.
6555      Swap min column with diagonal so that diagonal values will go first */
6556   PetscBool   hypre;
6557   const char *name;
6558   PetscCall(PetscObjectGetName((PetscObject)mat, &name));
6559   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", name, &hypre));
6560   if (hypre) {
6561     PetscInt *minj;
6562     PetscBT   hasdiag;
6563 
6564     PetscCall(PetscBTCreate(m, &hasdiag));
6565     PetscCall(PetscMalloc1(m, &minj));
6566     for (k = 0; k < m; k++) minj[k] = PETSC_MAX_INT;
6567     for (k = i1start; k < rem; k++) {
6568       if (j1[k] < cstart || j1[k] >= cend) continue;
6569       const PetscInt rindex = i1[k] - rstart;
6570       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6571       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6572     }
6573     for (k = 0; k < n2; k++) {
6574       if (j2[k] < cstart || j2[k] >= cend) continue;
6575       const PetscInt rindex = i2[k] - rstart;
6576       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6577       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6578     }
6579     for (k = i1start; k < rem; k++) {
6580       const PetscInt rindex = i1[k] - rstart;
6581       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6582       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6583       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6584     }
6585     for (k = 0; k < n2; k++) {
6586       const PetscInt rindex = i2[k] - rstart;
6587       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6588       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6589       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6590     }
6591     PetscCall(PetscBTDestroy(&hasdiag));
6592     PetscCall(PetscFree(minj));
6593   }
6594 
6595   /* Split local COOs and received COOs into diag/offdiag portions */
6596   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6597   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6598   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6599   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6600   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6601   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6602 
6603   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6604   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6605   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6606   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6607 
6608   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6609   PetscInt *Ai, *Bi;
6610   PetscInt *Aj, *Bj;
6611 
6612   PetscCall(PetscMalloc1(m + 1, &Ai));
6613   PetscCall(PetscMalloc1(m + 1, &Bi));
6614   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6615   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6616 
6617   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6618   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6619   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6620   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6621   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6622 
6623   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6624   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6625 
6626   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6627   /* expect nonzeros in A/B most likely have local contributing entries        */
6628   PetscInt    Annz = Ai[m];
6629   PetscInt    Bnnz = Bi[m];
6630   PetscCount *Ajmap1_new, *Bjmap1_new;
6631 
6632   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6633   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6634 
6635   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6636   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6637 
6638   PetscCall(PetscFree(Aimap1));
6639   PetscCall(PetscFree(Ajmap1));
6640   PetscCall(PetscFree(Bimap1));
6641   PetscCall(PetscFree(Bjmap1));
6642   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6643   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6644   PetscCall(PetscFree(perm1));
6645   PetscCall(PetscFree3(i2, j2, perm2));
6646 
6647   Ajmap1 = Ajmap1_new;
6648   Bjmap1 = Bjmap1_new;
6649 
6650   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6651   if (Annz < Annz1 + Annz2) {
6652     PetscInt *Aj_new;
6653     PetscCall(PetscMalloc1(Annz, &Aj_new));
6654     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6655     PetscCall(PetscFree(Aj));
6656     Aj = Aj_new;
6657   }
6658 
6659   if (Bnnz < Bnnz1 + Bnnz2) {
6660     PetscInt *Bj_new;
6661     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6662     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6663     PetscCall(PetscFree(Bj));
6664     Bj = Bj_new;
6665   }
6666 
6667   /* Create new submatrices for on-process and off-process coupling                  */
6668   PetscScalar     *Aa, *Ba;
6669   MatType          rtype;
6670   Mat_SeqAIJ      *a, *b;
6671   PetscObjectState state;
6672   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6673   PetscCall(PetscCalloc1(Bnnz, &Ba));
6674   /* make Aj[] local, i.e, based off the start column of the diagonal portion */
6675   if (cstart) {
6676     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6677   }
6678   PetscCall(MatDestroy(&mpiaij->A));
6679   PetscCall(MatDestroy(&mpiaij->B));
6680   PetscCall(MatGetRootType_Private(mat, &rtype));
6681   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6682   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6683   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6684   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6685   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6686   PetscCall(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6687 
6688   a               = (Mat_SeqAIJ *)mpiaij->A->data;
6689   b               = (Mat_SeqAIJ *)mpiaij->B->data;
6690   a->singlemalloc = b->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa,Bi,Bj,Ba */
6691   a->free_a = b->free_a = PETSC_TRUE;
6692   a->free_ij = b->free_ij = PETSC_TRUE;
6693 
6694   /* conversion must happen AFTER multiply setup */
6695   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6696   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6697   PetscCall(VecDestroy(&mpiaij->lvec));
6698   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6699 
6700   // Put the COO struct in a container and then attach that to the matrix
6701   PetscCall(PetscMalloc1(1, &coo));
6702   coo->n       = coo_n;
6703   coo->sf      = sf2;
6704   coo->sendlen = nleaves;
6705   coo->recvlen = nroots;
6706   coo->Annz    = Annz;
6707   coo->Bnnz    = Bnnz;
6708   coo->Annz2   = Annz2;
6709   coo->Bnnz2   = Bnnz2;
6710   coo->Atot1   = Atot1;
6711   coo->Atot2   = Atot2;
6712   coo->Btot1   = Btot1;
6713   coo->Btot2   = Btot2;
6714   coo->Ajmap1  = Ajmap1;
6715   coo->Aperm1  = Aperm1;
6716   coo->Bjmap1  = Bjmap1;
6717   coo->Bperm1  = Bperm1;
6718   coo->Aimap2  = Aimap2;
6719   coo->Ajmap2  = Ajmap2;
6720   coo->Aperm2  = Aperm2;
6721   coo->Bimap2  = Bimap2;
6722   coo->Bjmap2  = Bjmap2;
6723   coo->Bperm2  = Bperm2;
6724   coo->Cperm1  = Cperm1;
6725   // Allocate in preallocation. If not used, it has zero cost on host
6726   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6727   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6728   PetscCall(PetscContainerSetPointer(container, coo));
6729   PetscCall(PetscContainerSetUserDestroy(container, MatCOOStructDestroy_MPIAIJ));
6730   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6731   PetscCall(PetscContainerDestroy(&container));
6732   PetscFunctionReturn(PETSC_SUCCESS);
6733 }
6734 
/*
  MatSetValuesCOO_MPIAIJ - Fill the matrix with numerical values v[], ordered as the
  (coo_i, coo_j) arrays that were given to MatSetPreallocationCOO_MPIAIJ().

  The COO metadata (jmap/perm/imap arrays and the PetscSF) is fetched from the container the
  preallocation routine attached to the matrix. Off-process values are shipped to their owners
  with the SF, and that communication is overlapped with the local summation. With
  imode == INSERT_VALUES existing entries are overwritten, otherwise values are accumulated.
*/
static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  PetscScalar         *sendbuf, *recvbuf;
  const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
  const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
  const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
  const PetscCount    *Cperm1;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));
  /* Unpack the cached maps: the *1 arrays address locally contributed entries, the *2 arrays received ones */
  sendbuf = coo->sendbuf;
  recvbuf = coo->recvbuf;
  Ajmap1  = coo->Ajmap1;
  Ajmap2  = coo->Ajmap2;
  Aimap2  = coo->Aimap2;
  Bjmap1  = coo->Bjmap1;
  Bjmap2  = coo->Bjmap2;
  Bimap2  = coo->Bimap2;
  Aperm1  = coo->Aperm1;
  Aperm2  = coo->Aperm2;
  Bperm1  = coo->Bperm1;
  Bperm2  = coo->Bperm2;
  Cperm1  = coo->Cperm1;

  PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
  PetscCall(MatSeqAIJGetArray(B, &Ba));

  /* Pack entries to be sent to remote */
  for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
  /* Add local entries to A and B */
  for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
    PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
    for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
    Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
  }
  for (PetscCount i = 0; i < coo->Bnnz; i++) {
    PetscScalar sum = 0.0;
    for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
    Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));

  /* Add received remote entries to A and B; Aimap2[i]/Bimap2[i] map the i-th received unique entry to its nonzero slot */
  for (PetscCount i = 0; i < coo->Annz2; i++) {
    for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
  }
  for (PetscCount i = 0; i < coo->Bnnz2; i++) {
    for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
  }
  PetscCall(MatSeqAIJRestoreArray(A, &Aa));
  PetscCall(MatSeqAIJRestoreArray(B, &Ba));
  PetscFunctionReturn(PETSC_SUCCESS);
}
6798 
6799 /*MC
6800    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6801 
6802    Options Database Keys:
6803 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6804 
6805    Level: beginner
6806 
6807    Notes:
6808    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values,
6809     in this case the values associated with the rows and columns one passes in are set to zero
6810     in the matrix
6811 
    `MatSetOption`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
    space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
6814 
6815 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6816 M*/
/*
  MatCreate_MPIAIJ - Constructor for MATMPIAIJ: installs the operations table and registers
  type-specific methods and conversion routines via PetscObjectComposeFunction() (queried by
  name elsewhere in the library). No matrix storage is allocated here; that happens in a later
  preallocation call (e.g. MatMPIAIJSetPreallocation(), MatSetPreallocationCOO()).
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
{
  Mat_MPIAIJ *b;
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));

  PetscCall(PetscNew(&b));
  B->data       = (void *)b;
  B->ops[0]     = MatOps_Values; /* struct assignment copies the whole MATMPIAIJ function table */
  B->assembled  = PETSC_FALSE;
  B->insertmode = NOT_SET_VALUES;
  b->size       = size;

  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));

  /* build cache for off array entries formed */
  PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));

  b->donotstash  = PETSC_FALSE;
  b->colmap      = NULL;
  b->garray      = NULL;
  b->roworiented = PETSC_TRUE;

  /* stuff used for matrix vector multiply */
  b->lvec  = NULL;
  b->Mvctx = NULL;

  /* stuff for MatGetRow() */
  b->rowindices   = NULL;
  b->rowvalues    = NULL;
  b->getrowactive = PETSC_FALSE;

  /* flexible pointer used in CUSPARSE classes */
  b->spptr = NULL;

  /* Register type-specific methods looked up by name via PetscObjectQueryFunction() */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
  /* Conversion routines to other matrix types; GPU/third-party backends are guarded by configure-time flags */
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
#endif
#if defined(PETSC_HAVE_KOKKOS_KERNELS)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
#endif
#if defined(PETSC_HAVE_MKL_SPARSE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
#endif
#if defined(PETSC_HAVE_SCALAPACK)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ)); /* finally stamp the type name */
  PetscFunctionReturn(PETSC_SUCCESS);
}
6899 
6900 /*@C
6901   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6902   and "off-diagonal" part of the matrix in CSR format.
6903 
6904   Collective
6905 
6906   Input Parameters:
6907 + comm - MPI communicator
6908 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6909 . n    - This value should be the same as the local size used in creating the
6910        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
6911        calculated if `N` is given) For square matrices `n` is almost always `m`.
. M    - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
. N    - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
6914 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6915 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6916 . a    - matrix values
6917 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6918 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6919 - oa   - matrix values
6920 
6921   Output Parameter:
6922 . mat - the matrix
6923 
6924   Level: advanced
6925 
6926   Notes:
6927   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc. The user
6928   must free the arrays once the matrix has been destroyed and not before.
6929 
6930   The `i` and `j` indices are 0 based
6931 
6932   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6933 
6934   This sets local rows and cannot be used to set off-processor values.
6935 
6936   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6937   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6938   not easily support in-place reassembly. It is recommended to use MatSetValues() (or a variant thereof) because
6939   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6940   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6941   communication if it is known that only local entries will be set.
6942 
6943 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6944           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6945 @*/
PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
{
  Mat_MPIAIJ *maij;

  PetscFunctionBegin;
  /* Validate the user-supplied CSR metadata before wrapping it */
  PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
  PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
  PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
  PetscCall(MatCreate(comm, mat));
  PetscCall(MatSetSizes(*mat, m, n, M, N));
  PetscCall(MatSetType(*mat, MATMPIAIJ));
  maij = (Mat_MPIAIJ *)(*mat)->data;

  /* The arrays are wrapped (not copied), so the matrix is preallocated by construction */
  (*mat)->preallocated = PETSC_TRUE;

  PetscCall(PetscLayoutSetUp((*mat)->rmap));
  PetscCall(PetscLayoutSetUp((*mat)->cmap));

  /* Wrap the "diagonal" (A) and "off-diagonal" (B) CSR arrays in sequential AIJ matrices.
     The caller retains ownership of i, j, a, oi, oj, oa (see the manual page). */
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
  PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));

  /* All entries are local by construction, so assembly needs no off-process communication */
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
  PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
6974 
/* Scratch data shared by the backend MatMat-product implementations for MPIAIJ matrices.
   It is attached to the product matrix and released by MatDestroy_MatMatMPIAIJBACKEND(). */
typedef struct {
  Mat       *mp;    /* intermediate products */
  PetscBool *mptmp; /* is the intermediate product temporary ? */
  PetscInt   cp;    /* number of intermediate products */

  /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
  PetscInt    *startsj_s, *startsj_r;
  PetscScalar *bufa;
  Mat          P_oth;

  /* may take advantage of merging product->B */
  Mat Bloc; /* B-local by merging diag and off-diag */

  /* cusparse does not have support to split between symbolic and numeric phases.
     When api_user is true, we don't need to update the numerical values
     of the temporary storage */
  PetscBool reusesym;

  /* support for COO values insertion */
  PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
  PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
  PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
  PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
  PetscSF      sf;            /* used for non-local values insertion and memory malloc */
  PetscMemType mtype;

  /* customization */
  PetscBool abmerge;    /* NOTE(review): presumably enables merging B's diag/offdiag in AB products — confirm against the options handling */
  PetscBool P_oth_bind; /* NOTE(review): presumably binds P_oth to CPU memory — confirm against the options handling */
} MatMatMPIAIJBACKEND;
7005 
7006 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
7007 {
7008   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
7009   PetscInt             i;
7010 
7011   PetscFunctionBegin;
7012   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7013   PetscCall(PetscFree(mmdata->bufa));
7014   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7015   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7016   PetscCall(MatDestroy(&mmdata->P_oth));
7017   PetscCall(MatDestroy(&mmdata->Bloc));
7018   PetscCall(PetscSFDestroy(&mmdata->sf));
7019   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7020   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7021   PetscCall(PetscFree(mmdata->own[0]));
7022   PetscCall(PetscFree(mmdata->own));
7023   PetscCall(PetscFree(mmdata->off[0]));
7024   PetscCall(PetscFree(mmdata->off));
7025   PetscCall(PetscFree(mmdata));
7026   PetscFunctionReturn(PETSC_SUCCESS);
7027 }
7028 
7029 /* Copy selected n entries with indices in idx[] of A to v[].
7030    If idx is NULL, copy the whole data array of A to v[]
7031  */
7032 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7033 {
7034   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7035 
7036   PetscFunctionBegin;
7037   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7038   if (f) {
7039     PetscCall((*f)(A, n, idx, v));
7040   } else {
7041     const PetscScalar *vv;
7042 
7043     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7044     if (n && idx) {
7045       PetscScalar    *w  = v;
7046       const PetscInt *oi = idx;
7047       PetscInt        j;
7048 
7049       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7050     } else {
7051       PetscCall(PetscArraycpy(v, vv, n));
7052     }
7053     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7054   }
7055   PetscFunctionReturn(PETSC_SUCCESS);
7056 }
7057 
/* Numeric phase: refresh the temporary matrices, recompute the intermediate
   local products, collect their nonzero values into the COO buffers (scattering
   off-process contributions through the SF), and insert them into C */
static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
{
  MatMatMPIAIJBACKEND *mmdata;
  PetscInt             i, n_d, n_o; /* running offsets into coo_v (on-process) and coo_w (off-process send buffer) */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
  if (!mmdata->reusesym) { /* update temporary matrices */
    if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
    if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
  }
  /* the symbolic-phase values can be reused only for the first numeric call */
  mmdata->reusesym = PETSC_FALSE;

  /* recompute every intermediate product */
  for (i = 0; i < mmdata->cp; i++) {
    PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
    PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
  }
  /* harvest the nonzero values of each non-temporary product, splitting them
     between the off-process send buffer (coo_w) and the local buffer (coo_v)
     according to the index segments built in the symbolic phase */
  for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
    PetscInt noff = mmdata->off[i + 1] - mmdata->off[i];

    if (mmdata->mptmp[i]) continue;
    if (noff) {
      PetscInt nown = mmdata->own[i + 1] - mmdata->own[i];

      PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
      PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
      n_o += noff;
      n_d += nown;
    } else {
      Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;

      /* no off-process entries for this product: copy its whole value array */
      PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
      n_d += mm->nz;
    }
  }
  if (mmdata->hasoffproc) { /* offprocess insertion */
    /* values received from other ranks land after the locally produced ones */
    PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
    PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
  }
  PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
  PetscFunctionReturn(PETSC_SUCCESS);
}
7102 
/* Support for Pt * A, A * P, or Pt * A * P */
#define MAX_NUMBER_INTERMEDIATE 4
PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
{
  Mat_Product           *product = C->product;
  Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
  Mat_MPIAIJ            *a, *p;
  MatMatMPIAIJBACKEND   *mmdata;
  ISLocalToGlobalMapping P_oth_l2g = NULL;
  IS                     glob      = NULL;
  const char            *prefix;
  char                   pprefix[256];
  const PetscInt        *globidx, *P_oth_idx;
  PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
  PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
  PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
                                                                                         /* type-0: consecutive, start from 0; type-1: consecutive with */
                                                                                         /* a base offset; type-2: sparse with a local to global map table */
  const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */

  MatProductType ptype;
  PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
  PetscMPIInt    size;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
  ptype = product->type;
  /* treat A^t*B as A*B when A is symmetric: same result, cheaper product */
  if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  /* assign the (A,P) roles and the result sizes per product type; AtB and PtAP
     generate rows owned by other processes and hence need off-process insertion */
  switch (ptype) {
  case MATPRODUCT_AB:
    A          = product->A;
    P          = product->B;
    m          = A->rmap->n;
    n          = P->cmap->n;
    M          = A->rmap->N;
    N          = P->cmap->N;
    hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
    break;
  case MATPRODUCT_AtB:
    P          = product->A;
    A          = product->B;
    m          = P->cmap->n;
    n          = A->cmap->n;
    M          = P->cmap->N;
    N          = A->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  case MATPRODUCT_PtAP:
    A          = product->A;
    P          = product->B;
    m          = P->cmap->n;
    n          = P->cmap->n;
    M          = P->cmap->N;
    N          = P->cmap->N;
    hasoffproc = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
  if (size == 1) hasoffproc = PETSC_FALSE; /* a single rank owns every row */

  /* defaults */
  for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
    mp[i]    = NULL;
    mptmp[i] = PETSC_FALSE;
    rmapt[i] = -1;
    cmapt[i] = -1;
    rmapa[i] = NULL;
    cmapa[i] = NULL;
  }

  /* customization */
  PetscCall(PetscNew(&mmdata));
  mmdata->reusesym = product->api_user;
  if (ptype == MATPRODUCT_AB) {
    if (product->api_user) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
      PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
      PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
      PetscOptionsEnd();
    } else {
      PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
      PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
      PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
      PetscOptionsEnd();
    }
  } else if (ptype == MATPRODUCT_PtAP) {
    if (product->api_user) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
      PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
      PetscOptionsEnd();
    } else {
      PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
      PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
      PetscOptionsEnd();
    }
  }
  /* finalize C's layout/type before building the intermediate products */
  a = (Mat_MPIAIJ *)A->data;
  p = (Mat_MPIAIJ *)P->data;
  PetscCall(MatSetSizes(C, m, n, M, N));
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
  PetscCall(MatGetOptionsPrefix(C, &prefix));

  /* build the series of local products mp[0..cp-1], recording for each one how
     its local row/col indices map to global indices of C (rmapt/cmapt tables) */
  cp = 0;
  switch (ptype) {
  case MATPRODUCT_AB: /* A * P */
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));

    /* A_diag * P_local (merged or not) */
    if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
      /* P is product->B */
      PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
      PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      PetscCall(ISGetIndices(glob, &globidx));
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else { /* A_diag * P_diag and A_diag * P_off */
      PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      rmapt[cp] = 1;
      cmapt[cp] = 1;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = p->garray;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }

    /* A_off * P_other */
    if (mmdata->P_oth) {
      PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
      PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
      PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)(a->B))->type_name));
      PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
      PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;

  case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
    /* A is product->B */
    PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
    if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
      PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      PetscCall(ISGetIndices(glob, &globidx));
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    } else {
      PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      PetscCall(ISGetIndices(glob, &globidx));
      rmapt[cp] = 1;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
      PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      rmapt[cp] = 2;
      rmapa[cp] = p->garray;
      cmapt[cp] = 2;
      cmapa[cp] = globidx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  case MATPRODUCT_PtAP:
    PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
    /* P is product->B */
    PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
    PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
    PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
    PetscCall(MatProductSetFill(mp[cp], product->fill));
    PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
    PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
    PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
    mp[cp]->product->api_user = product->api_user;
    PetscCall(MatProductSetFromOptions(mp[cp]));
    PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
    PetscCall(ISGetIndices(glob, &globidx));
    rmapt[cp] = 2;
    rmapa[cp] = globidx;
    cmapt[cp] = 2;
    cmapa[cp] = globidx;
    mptmp[cp] = PETSC_FALSE;
    cp++;
    if (mmdata->P_oth) {
      PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
      PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
      PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)(a->B))->type_name));
      PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
      /* A_off * P_oth: only a temporary used as input to the product below */
      PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      mptmp[cp] = PETSC_TRUE;
      cp++;
      /* P_loc^t * (A_off * P_oth), i.e. mp[1] computed just above */
      PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
      PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
      PetscCall(MatProductSetFill(mp[cp], product->fill));
      PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
      PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
      PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
      mp[cp]->product->api_user = product->api_user;
      PetscCall(MatProductSetFromOptions(mp[cp]));
      PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
      rmapt[cp] = 2;
      rmapa[cp] = globidx;
      cmapt[cp] = 2;
      cmapa[cp] = P_oth_idx;
      mptmp[cp] = PETSC_FALSE;
      cp++;
    }
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
  }
  /* sanity check */
  if (size > 1)
    for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);

  PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
  for (i = 0; i < cp; i++) {
    mmdata->mp[i]    = mp[i];
    mmdata->mptmp[i] = mptmp[i];
  }
  mmdata->cp             = cp;
  C->product->data       = mmdata;
  C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
  C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;

  /* memory type */
  mmdata->mtype = PETSC_MEMTYPE_HOST;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
  if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
  else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
  else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;

  /* prepare coo coordinates for values insertion */

  /* count total nonzeros of those intermediate seqaij Mats
    ncoo_d:    # of nonzeros of matrices that do not have offproc entries
    ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted to remote procs
    ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
  */
  for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
    if (mptmp[cp]) continue;
    if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scatter to all processes (might include self) */
      const PetscInt *rmap = rmapa[cp];
      const PetscInt  mr   = mp[cp]->rmap->n;
      const PetscInt  rs   = C->rmap->rstart;
      const PetscInt  re   = C->rmap->rend;
      const PetscInt *ii   = mm->i;
      for (i = 0; i < mr; i++) {
        const PetscInt gr = rmap[i];
        const PetscInt nz = ii[i + 1] - ii[i];
        if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
        else ncoo_oown += nz;                  /* this row is local */
      }
    } else ncoo_d += mm->nz;
  }

  /*
    ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc

    ncoo = ncoo_d + ncoo_oown + ncoo2, which ncoo2 is number of nonzeros inserted to me by other procs.

    off[0] points to a big index array, which is shared by off[1,2,...]. Similarly, for own[0].

    off[p]: points to the segment for matrix mp[p], storing location of nonzeros that mp[p] will insert to others
    own[p]: points to the segment for matrix mp[p], storing location of nonzeros that mp[p] will insert locally
    so, off[p+1]-off[p] is the number of nonzeros that mp[p] will send to others.

    coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
    Ex. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores i of local nonzeros, and the remaining part stores i of nonzeros I will receive.
  */
  PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
  PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));

  /* gather (i,j) of nonzeros inserted by remote procs */
  if (hasoffproc) {
    PetscSF  msf;
    PetscInt ncoo2, *coo_i2, *coo_j2;

    PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
    PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
    PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */

    for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
      Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
      PetscInt   *idxoff = mmdata->off[cp];
      PetscInt   *idxown = mmdata->own[cp];
      if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
        const PetscInt *rmap = rmapa[cp];
        const PetscInt *cmap = cmapa[cp];
        const PetscInt *ii   = mm->i;
        PetscInt       *coi  = coo_i + ncoo_o;
        PetscInt       *coj  = coo_j + ncoo_o;
        const PetscInt  mr   = mp[cp]->rmap->n;
        const PetscInt  rs   = C->rmap->rstart;
        const PetscInt  re   = C->rmap->rend;
        const PetscInt  cs   = C->cmap->rstart;
        for (i = 0; i < mr; i++) {
          const PetscInt *jj = mm->j + ii[i];
          const PetscInt  gr = rmap[i];
          const PetscInt  nz = ii[i + 1] - ii[i];
          if (gr < rs || gr >= re) { /* this is an offproc row */
            for (j = ii[i]; j < ii[i + 1]; j++) {
              *coi++    = gr;
              *idxoff++ = j;
            }
            if (!cmapt[cp]) { /* already global */
              for (j = 0; j < nz; j++) *coj++ = jj[j];
            } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
              for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
            } else { /* offdiag */
              for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
            }
            ncoo_o += nz;
          } else { /* this is a local row */
            for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
          }
        }
      }
      /* record segment ends so off[p+1]-off[p] / own[p+1]-own[p] give per-product counts */
      mmdata->off[cp + 1] = idxoff;
      mmdata->own[cp + 1] = idxown;
    }

    /* build an SF routing each off-process (i,j) to its owner, then learn how
       many entries (ncoo2) other ranks will send to me */
    PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
    PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, ncoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
    PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
    PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
    ncoo = ncoo_d + ncoo_oown + ncoo2;
    PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
    PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
    PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
    PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
    PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
    PetscCall(PetscFree2(coo_i, coo_j));
    /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
    PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
    coo_i = coo_i2;
    coo_j = coo_j2;
  } else { /* no offproc values insertion */
    ncoo = ncoo_d;
    PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));

    /* an empty SF still provides the memory-type-aware malloc/free used for coo_v */
    PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
    PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
    PetscCall(PetscSFSetUp(mmdata->sf));
  }
  mmdata->hasoffproc = hasoffproc;

  /* gather (i,j) of nonzeros inserted locally */
  for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
    Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
    PetscInt       *coi  = coo_i + ncoo_d;
    PetscInt       *coj  = coo_j + ncoo_d;
    const PetscInt *jj   = mm->j;
    const PetscInt *ii   = mm->i;
    const PetscInt *cmap = cmapa[cp];
    const PetscInt *rmap = rmapa[cp];
    const PetscInt  mr   = mp[cp]->rmap->n;
    const PetscInt  rs   = C->rmap->rstart;
    const PetscInt  re   = C->rmap->rend;
    const PetscInt  cs   = C->cmap->rstart;

    if (mptmp[cp]) continue;
    if (rmapt[cp] == 1) { /* consecutive rows */
      /* fill coo_i */
      for (i = 0; i < mr; i++) {
        const PetscInt gr = i + rs;
        for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
      }
      /* fill coo_j */
      if (!cmapt[cp]) { /* type-0, already global */
        PetscCall(PetscArraycpy(coj, jj, mm->nz));
      } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
        for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
      } else {                                            /* type-2, local to global for sparse columns */
        for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
      }
      ncoo_d += mm->nz;
    } else if (rmapt[cp] == 2) { /* sparse rows */
      for (i = 0; i < mr; i++) {
        const PetscInt *jj = mm->j + ii[i];
        const PetscInt  gr = rmap[i];
        const PetscInt  nz = ii[i + 1] - ii[i];
        if (gr >= rs && gr < re) { /* local rows */
          for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
          if (!cmapt[cp]) { /* type-0, already global */
            for (j = 0; j < nz; j++) *coj++ = jj[j];
          } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
            for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
          } else { /* type-2, local to global for sparse columns */
            for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
          }
          ncoo_d += nz;
        }
      }
    }
  }
  if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
  PetscCall(ISDestroy(&glob));
  if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
  PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
  /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
  PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));

  /* preallocate with COO data */
  PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
  PetscCall(PetscFree2(coo_i, coo_j));
  PetscFunctionReturn(PETSC_SUCCESS);
}
7599 
7600 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7601 {
7602   Mat_Product *product = mat->product;
7603 #if defined(PETSC_HAVE_DEVICE)
7604   PetscBool match  = PETSC_FALSE;
7605   PetscBool usecpu = PETSC_FALSE;
7606 #else
7607   PetscBool match = PETSC_TRUE;
7608 #endif
7609 
7610   PetscFunctionBegin;
7611   MatCheckProduct(mat, 1);
7612 #if defined(PETSC_HAVE_DEVICE)
7613   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7614   if (match) { /* we can always fallback to the CPU if requested */
7615     switch (product->type) {
7616     case MATPRODUCT_AB:
7617       if (product->api_user) {
7618         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7619         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7620         PetscOptionsEnd();
7621       } else {
7622         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7623         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7624         PetscOptionsEnd();
7625       }
7626       break;
7627     case MATPRODUCT_AtB:
7628       if (product->api_user) {
7629         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7630         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7631         PetscOptionsEnd();
7632       } else {
7633         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7634         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7635         PetscOptionsEnd();
7636       }
7637       break;
7638     case MATPRODUCT_PtAP:
7639       if (product->api_user) {
7640         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7641         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7642         PetscOptionsEnd();
7643       } else {
7644         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7645         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7646         PetscOptionsEnd();
7647       }
7648       break;
7649     default:
7650       break;
7651     }
7652     match = (PetscBool)!usecpu;
7653   }
7654 #endif
7655   if (match) {
7656     switch (product->type) {
7657     case MATPRODUCT_AB:
7658     case MATPRODUCT_AtB:
7659     case MATPRODUCT_PtAP:
7660       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7661       break;
7662     default:
7663       break;
7664     }
7665   }
7666   /* fallback to MPIAIJ ops */
7667   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7668   PetscFunctionReturn(PETSC_SUCCESS);
7669 }
7670 
7671 /*
7672    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7673 
7674    n - the number of block indices in cc[]
7675    cc - the block indices (must be large enough to contain the indices)
7676 */
7677 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7678 {
7679   PetscInt        cnt = -1, nidx, j;
7680   const PetscInt *idx;
7681 
7682   PetscFunctionBegin;
7683   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7684   if (nidx) {
7685     cnt     = 0;
7686     cc[cnt] = idx[0] / bs;
7687     for (j = 1; j < nidx; j++) {
7688       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7689     }
7690   }
7691   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7692   *n = cnt + 1;
7693   PetscFunctionReturn(PETSC_SUCCESS);
7694 }
7695 
7696 /*
7697     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7698 
7699     ncollapsed - the number of block indices
7700     collapsed - the block indices (must be large enough to contain the indices)
7701 */
7702 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7703 {
7704   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7705 
7706   PetscFunctionBegin;
7707   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7708   for (i = start + 1; i < start + bs; i++) {
7709     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7710     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7711     cprevtmp = cprev;
7712     cprev    = merged;
7713     merged   = cprevtmp;
7714   }
7715   *ncollapsed = nprev;
7716   if (collapsed) *collapsed = cprev;
7717   PetscFunctionReturn(PETSC_SUCCESS);
7718 }
7719 
/*
 MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix

 Input Parameters:
 + Amat - matrix
 . symmetrize - make the result symmetric
 . scale - scale with diagonal
 - filter - filter out graph values below this threshold (negative means no filtering)

 Output Parameter:
 . a_Gmat - output scalar graph >= 0

*/
PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, Mat *a_Gmat)
{
  PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
  MPI_Comm  comm;
  Mat       Gmat;
  PetscBool ismpiaij, isseqaij;
  Mat       a, b, c;
  MatType   jtype;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
  PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
  PetscCall(MatGetSize(Amat, &MM, &NN));
  PetscCall(MatGetBlockSize(Amat, &bs));
  nloc = (Iend - Istart) / bs; /* number of local block rows = local rows of the graph */

  PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
  PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");

  /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
  /* A solution consists in providing a new API, MatAIJGetCollapsedAIJ, and each class can provide a fast
     implementation */
  if (bs > 1) {
    /* bs > 1: collapse each bs x bs block of Amat to a single scalar graph entry */
    PetscCall(MatGetType(Amat, &jtype));
    PetscCall(MatCreate(comm, &Gmat));
    PetscCall(MatSetType(Gmat, jtype));
    PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
    PetscCall(MatSetBlockSizes(Gmat, 1, 1));
    /* fast path: SeqAIJ, or an MPIAIJ whose off-diagonal column map (garray) is available */
    if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
      PetscInt  *d_nnz, *o_nnz;
      MatScalar *aa, val, *AA;
      PetscInt  *aj, *ai, *AJ, nc, nmax = 0; /* nmax = longest (block) row, sizes the AA/AJ work arrays */
      if (isseqaij) {
        a = Amat;
        b = NULL;
      } else {
        Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
        a             = d->A; /* diagonal block */
        b             = d->B; /* off-diagonal block */
      }
      PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
      /* note: o_nnz gets a zero-length allocation in the sequential case */
      PetscCall(PetscMalloc2(nloc, &d_nnz, isseqaij ? 0 : nloc, &o_nnz));
      /* count block nonzeros per block row, and verify every block is fully dense */
      for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
        PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
        const PetscInt *cols1, *cols2;
        for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
          PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
          nnz[brow / bs] = nc2 / bs;
          if (nc2 % bs) ok = 0; /* row length not a multiple of bs: blocks cannot be dense */
          if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
          for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
            PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
            if (nc1 != nc2) ok = 0;
            else {
              for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
                if (cols1[jj] != cols2[jj]) ok = 0;
                if (cols1[jj] % bs != jj % bs) ok = 0;
              }
            }
            PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
          }
          PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
          if (!ok) {
            /* found a partially-filled block: abandon the fast path and jump to the general one below */
            PetscCall(PetscFree2(d_nnz, o_nnz));
            PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
            goto old_bs;
          }
        }
      }
      /* only the call matching Gmat's actual type takes effect; the other is a no-op */
      PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
      PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
      PetscCall(PetscFree2(d_nnz, o_nnz));
      PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ)); /* work arrays holding one graph row (values/columns) */
      // diag
      for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
        Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;                      /* access raw CSR of the diagonal block */
        ai               = aseq->i;
        n                = ai[brow + 1] - ai[brow];
        aj               = aseq->j + ai[brow];
        for (int k = 0; k < n; k += bs) {        // block columns
          AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
          val        = 0;
          for (int ii = 0; ii < bs; ii++) { // rows in block
            aa = aseq->a + ai[brow + ii] + k;
            for (int jj = 0; jj < bs; jj++) {         // columns in block
              val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
            }
          }
          PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%d) >= nmax (%d)", (int)(k / bs), (int)nmax);
          AA[k / bs] = val;
        }
        grow = Istart / bs + brow / bs; /* global graph row */
        PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, INSERT_VALUES));
      }
      // off-diag
      if (ismpiaij) {
        Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
        const PetscScalar *vals;
        const PetscInt    *cols, *garray = aij->garray;
        PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
        for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
          /* first pass over the leading row: set global block-column indices and zero the accumulators */
          PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
          for (int k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
            PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
            AA[k / bs] = 0;
            AJ[cidx]   = garray[cols[k]] / bs; /* garray: local off-diag column -> global column */
          }
          nc = ncols / bs;
          PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
          /* second pass: accumulate the block norms over the bs rows of this block row */
          for (int ii = 0; ii < bs; ii++) { // rows in block
            PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
            for (int k = 0; k < ncols; k += bs) {
              for (int jj = 0; jj < bs; jj++) { // cols in block
                PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%d) >= nmax (%d)", (int)(k / bs), (int)nmax);
                AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
              }
            }
            PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
          }
          grow = Istart / bs + brow / bs;
          PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, INSERT_VALUES));
        }
      }
      PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
      PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
      PetscCall(PetscFree2(AA, AJ));
    } else {
      /* general (slow) path: also the landing point of the goto from the fast path above */
      const PetscScalar *vals;
      const PetscInt    *idx;
      PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
    old_bs:
      /*
       Determine the preallocation needed for the scalar matrix derived from the vector matrix.
       */
      PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
      PetscCall(PetscMalloc2(nloc, &d_nnz, isseqaij ? 0 : nloc, &o_nnz));
      if (isseqaij) {
        PetscInt max_d_nnz;
        /*
         Determine exact preallocation count for (sequential) scalar matrix
         */
        PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
        max_d_nnz = PetscMin(nloc, bs * max_d_nnz); /* worst case: every scalar column is in a distinct block */
        PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
        for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
        PetscCall(PetscFree3(w0, w1, w2));
      } else if (ismpiaij) {
        Mat             Daij, Oaij;
        const PetscInt *garray;
        PetscInt        max_d_nnz;
        PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
        /*
         Determine exact preallocation count for diagonal block portion of scalar matrix
         */
        PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
        max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
        PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
        for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
        PetscCall(PetscFree3(w0, w1, w2));
        /*
         Over estimate (usually grossly over), preallocation count for off-diagonal portion of scalar matrix
         */
        for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
          o_nnz[jj] = 0;
          for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
            PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
            o_nnz[jj] += ncols;
            PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
          }
          /* cap at the number of off-process block columns that exist */
          if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
        }
      } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
      /* get scalar copy (norms) of matrix */
      /* only the call matching Gmat's actual type takes effect; the other is a no-op */
      PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
      PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
      PetscCall(PetscFree2(d_nnz, o_nnz));
      /* ADD_VALUES accumulates |a_ij| of all scalar entries that collapse to one graph entry */
      for (Ii = Istart; Ii < Iend; Ii++) {
        PetscInt dest_row = Ii / bs;
        PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
        for (jj = 0; jj < ncols; jj++) {
          PetscInt    dest_col = idx[jj] / bs;
          PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
          PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
        }
        PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
      }
      PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
      PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
    }
  } else {
    /* bs == 1: the graph is Amat itself; copy only if we are going to modify the values */
    if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
    else {
      Gmat = Amat;
      PetscCall(PetscObjectReference((PetscObject)Gmat));
    }
    if (isseqaij) {
      a = Gmat;
      b = NULL;
    } else {
      Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
      a             = d->A; /* diagonal block */
      b             = d->B; /* off-diagonal block */
    }
    if (filter >= 0 || scale) {
      /* take absolute value of each entry */
      for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
        MatInfo      info;
        PetscScalar *avals;
        PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
        PetscCall(MatSeqAIJGetArray(c, &avals));
        for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
        PetscCall(MatSeqAIJRestoreArray(c, &avals));
      }
    }
  }
  if (symmetrize) {
    PetscBool isset, issym;
    PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
    if (!isset || !issym) {
      /* Gmat += Gmat^T to symmetrize */
      Mat matTrans;
      PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
      PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatDestroy(&matTrans));
    }
    PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
  } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
  if (scale) {
    /* scale c for all diagonal values = 1 or -1 */
    Vec diag;
    PetscCall(MatCreateVecs(Gmat, &diag, NULL));
    PetscCall(MatGetDiagonal(Gmat, diag));
    PetscCall(VecReciprocal(diag));
    PetscCall(VecSqrtAbs(diag));
    PetscCall(MatDiagonalScale(Gmat, diag, diag)); /* G <- D^{-1/2} G D^{-1/2} */
    PetscCall(VecDestroy(&diag));
  }
  PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));

  if (filter >= 0) {
    /* drop entries below the threshold (values are non-negative at this point) */
    PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
    PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
  }
  *a_Gmat = Gmat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
7978 
7979 /*
7980     Special version for direct calls from Fortran
7981 */
7982 #include <petsc/private/fortranimpl.h>
7983 
7984 /* Change these macros so can be used in void function */
7985 /* Identical to PetscCallVoid, except it assigns to *_ierr */
7986 #undef PetscCall
7987 #define PetscCall(...) \
7988   do { \
7989     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
7990     if (PetscUnlikely(ierr_msv_mpiaij)) { \
7991       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
7992       return; \
7993     } \
7994   } while (0)
7995 
7996 #undef SETERRQ
7997 #define SETERRQ(comm, ierr, ...) \
7998   do { \
7999     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8000     return; \
8001   } while (0)
8002 
8003 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8004   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8005 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8006   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8007 #else
8008 #endif
/*
  matsetvaluesmpiaij_ - Fortran-callable fast path for MatSetValues() on MATMPIAIJ.
  Returns void and reports errors through *_ierr via the PetscCall()/SETERRQ macros
  redefined above for this purpose.
*/
PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
{
  Mat         mat = *mmat;
  PetscInt    m = *mm, n = *mn;
  InsertMode  addv = *maddv;
  Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
  PetscScalar value;

  MatCheckPreallocated(mat, 1);
  if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
  else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
  {
    PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
    PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
    PetscBool roworiented = aij->roworiented;

    /* Some Variables required in the macro */
    Mat         A     = aij->A;
    Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
    PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
    MatScalar  *aa;
    PetscBool   ignorezeroentries = (((a->ignorezeroentries) && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
    Mat         B                 = aij->B;
    Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
    PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
    MatScalar  *ba;
    /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
     * cannot use "#if defined" inside a macro. */
    PETSC_UNUSED PetscBool inserted = PETSC_FALSE;

    /* search/insertion state consumed by the MatSetValues_SeqAIJ_{A,B}_Private() macros */
    PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
    PetscInt   nonew = a->nonew;
    MatScalar *ap1, *ap2;

    PetscFunctionBegin;
    PetscCall(MatSeqAIJGetArray(A, &aa));
    PetscCall(MatSeqAIJGetArray(B, &ba));
    for (i = 0; i < m; i++) {
      if (im[i] < 0) continue; /* negative row indices are silently ignored */
      PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
      if (im[i] >= rstart && im[i] < rend) { /* row is owned by this process */
        row      = im[i] - rstart;
        /* cursors into the diagonal block A for this local row */
        lastcol1 = -1;
        rp1      = aj + ai[row];
        ap1      = aa + ai[row];
        rmax1    = aimax[row];
        nrow1    = ailen[row];
        low1     = 0;
        high1    = nrow1;
        /* cursors into the off-diagonal block B for this local row */
        lastcol2 = -1;
        rp2      = bj + bi[row];
        ap2      = ba + bi[row];
        rmax2    = bimax[row];
        nrow2    = bilen[row];
        low2     = 0;
        high2    = nrow2;

        for (j = 0; j < n; j++) {
          if (roworiented) value = v[i * n + j];
          else value = v[i + j * m];
          /* zero off-diagonal entries may be skipped when adding, per a->ignorezeroentries */
          if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
          if (in[j] >= cstart && in[j] < cend) {
            /* owned column: insert into the diagonal block A */
            col = in[j] - cstart;
            MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
          } else if (in[j] < 0) continue;
          else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
            SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          } else {
            /* non-owned column: insert into the off-diagonal block B */
            if (mat->was_assembled) {
              /* colmap maps global column -> local B column + 1 (0 means not present) */
              if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
              PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
              col--;
#else
              col = aij->colmap[in[j]] - 1;
#endif
              if (col < 0 && !((Mat_SeqAIJ *)(aij->A->data))->nonew) {
                /* new off-diagonal entry after assembly: convert B back to global indexing */
                PetscCall(MatDisAssemble_MPIAIJ(mat));
                col = in[j];
                /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
                B        = aij->B;
                b        = (Mat_SeqAIJ *)B->data;
                bimax    = b->imax;
                bi       = b->i;
                bilen    = b->ilen;
                bj       = b->j;
                rp2      = bj + bi[row];
                ap2      = ba + bi[row];
                rmax2    = bimax[row];
                nrow2    = bilen[row];
                low2     = 0;
                high2    = nrow2;
                bm       = aij->B->rmap->n;
                ba       = b->a;
                inserted = PETSC_FALSE;
              }
            } else col = in[j]; /* before first assembly B uses global column indices */
            MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
          }
        }
      } else if (!aij->donotstash) {
        /* row owned by another process: stash the values for communication at assembly time */
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
    PetscCall(MatSeqAIJRestoreArray(A, &aa));
    PetscCall(MatSeqAIJRestoreArray(B, &ba));
  }
  PetscFunctionReturnVoid();
}
8122 
8123 /* Undefining these here since they were redefined from their original definition above! No
8124  * other PETSc functions should be defined past this point, as it is impossible to recover the
8125  * original definitions */
8126 #undef PetscCall
8127 #undef SETERRQ
8128